removed BEGIN_OPENCV_DEVICE_NAMESPACE macros

2011-11-14 09:02:06 +00:00
parent d926541311
commit 0f53f2993e
73 changed files with 19272 additions and 19504 deletions
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -45,423 +45,421 @@
 #include "opencv2/gpu/device/vec_distance.hpp"
 #include "opencv2/gpu/device/datamov_utils.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace bf_radius_match {
-
-///////////////////////////////////////////////////////////////////////////////
-// Match Unrolled
-
-template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
-__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
-    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
+namespace cv { namespace gpu { namespace device 
 {
-    #if __CUDA_ARCH__ >= 110
-
-    extern __shared__ int smem[];
-
-    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
-    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-
-    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-    Dist dist;
-
-    #pragma unroll
-    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+    namespace bf_radius_match 
    {
-        const int loadX = threadIdx.x + i * BLOCK_SIZE;
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match Unrolled

-        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
-        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
-
-        if (loadX < query.cols)
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+        __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
+            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
-            T val;
+            #if __CUDA_ARCH__ >= 110

-            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
-            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
+            extern __shared__ int smem[];

-            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
-            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            Dist dist;
+
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                if (loadX < query.cols)
+                {
+                    T val;
+
+                    ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
+
+                    ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            float distVal = (typename Dist::result_type)dist;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
+            {
+                unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
+                if (ind < maxCount)
+                {
+                    bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
+                    if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
+                    bestDistance.ptr(queryIdx)[ind] = distVal;
+                }
+            }
+
+            #endif
        }

-        __syncthreads();
-
-        #pragma unroll
-        for (int j = 0; j < BLOCK_SIZE; ++j)
-            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
-
-        __syncthreads();
-    }
-
-    float distVal = (typename Dist::result_type)dist;
-
-    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
-    {
-        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
-        if (ind < maxCount)
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
+        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
        {
-            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
-            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
-            bestDistance.ptr(queryIdx)[ind] = distVal;
-        }
-    }
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

-    #endif
-}
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
-    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
-{
-    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, 
+                trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
+            cudaSafeCall( cudaGetLastError() );

-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }   

-    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, 
-        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}   
-
-template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T> 
-void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, 
-    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    cudaStream_t stream)
-{
-    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-    for (int i = 0; i < n; ++i)
-    {
-        const DevMem2D_<T> train = trains[i];
-
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
-
-        if (masks != 0 && masks[i].data)
+        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T> 
+        void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, 
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+            cudaStream_t stream)
        {
-            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), 
-                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
-        }
-        else
-        {
-            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), 
-                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
-        }
-        cudaSafeCall( cudaGetLastError() );
-    }
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

-///////////////////////////////////////////////////////////////////////////////
-// Match
+            for (int i = 0; i < n; ++i)
+            {
+                const DevMem2D_<T> train = trains[i];

-template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
-__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
-    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
-{
-    #if __CUDA_ARCH__ >= 110
+                const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

-    extern __shared__ int smem[];
+                if (masks != 0 && masks[i].data)
+                {
+                    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), 
+                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+                }
+                else
+                {
+                    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), 
+                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+                }
+                cudaSafeCall( cudaGetLastError() );
+            }

-    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
-    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-
-    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-    Dist dist;
-
-    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
-    {
-        const int loadX = threadIdx.x + i * BLOCK_SIZE;
-
-        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
-        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
-
-        if (loadX < query.cols)
-        {
-            T val;
-
-            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
-            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
-
-            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
-            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        __syncthreads();
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match

-        #pragma unroll
-        for (int j = 0; j < BLOCK_SIZE; ++j)
-            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
-
-        __syncthreads();
-    }
-
-    float distVal = (typename Dist::result_type)dist;
-
-    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
-    {
-        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
-        if (ind < maxCount)
+        template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+        __global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
+            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
-            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
-            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
-            bestDistance.ptr(queryIdx)[ind] = distVal;
+            #if __CUDA_ARCH__ >= 110
+
+            extern __shared__ int smem[];
+
+            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
+            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
+
+            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+            Dist dist;
+
+            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+
+                if (loadX < query.cols)
+                {
+                    T val;
+
+                    ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
+
+                    ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            float distVal = (typename Dist::result_type)dist;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
+            {
+                unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
+                if (ind < maxCount)
+                {
+                    bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
+                    if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
+                    bestDistance.ptr(queryIdx)[ind] = distVal;
+                }
+            }
+
+            #endif
        }
-    }

-    #endif
-}
-
-template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> 
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
-    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    cudaStream_t stream)
-{
-    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
-
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-    match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, 
-        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-template <int BLOCK_SIZE, typename Dist, typename T> 
-void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, 
-    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    cudaStream_t stream)
-{
-    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-
-    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-    for (int i = 0; i < n; ++i)
-    {
-        const DevMem2D_<T> train = trains[i];
-
-        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
-
-        if (masks != 0 && masks[i].data)
+        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> 
+        void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+            cudaStream_t stream)
        {
-            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), 
-                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask, 
+                trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        else
+
+        template <int BLOCK_SIZE, typename Dist, typename T> 
+        void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, 
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+            cudaStream_t stream)
        {
-            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), 
-                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+
+            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+            for (int i = 0; i < n; ++i)
+            {
+                const DevMem2D_<T> train = trains[i];
+
+                const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
+
+                if (masks != 0 && masks[i].data)
+                {
+                    match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]), 
+                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+                }
+                else
+                {
+                    match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(), 
+                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
+                }
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        cudaSafeCall( cudaGetLastError() );
-    }

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+        ///////////////////////////////////////////////////////////////////////////////
+        // Match dispatcher

-///////////////////////////////////////////////////////////////////////////////
-// Match dispatcher
+        template <typename Dist, typename T, typename Mask> // Single-train-set dispatcher: chooses a matching wrapper from query.cols.
+        void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
+                             const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+                             int cc, cudaStream_t stream) // NOTE(review): cc is never read in this body.
+        {
+            if (query.cols <= 64) // at most 64 columns: matchUnrolled with BLOCK_SIZE = 16, MAX_DESC_LEN = 64
+            {
+                matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
+            }
+            else if (query.cols <= 128) // 65..128 columns: matchUnrolled with BLOCK_SIZE = 16, MAX_DESC_LEN = 128
+            {
+                matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
+            } // the 256/512/1024 unrolled variants below are commented out, so those widths reach the final else
+            /*else if (query.cols <= 256)
+            {
+                matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
+            }
+            else if (query.cols <= 512)
+            {            
+                matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
+            }
+            else if (query.cols <= 1024)
+            {            
+                matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
+            }*/
+            else // more than 128 columns: non-unrolled match wrapper with BLOCK_SIZE = 16
+            {
+                match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
+            }
+        }

-template <typename Dist, typename T, typename Mask> 
-void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask, 
-                     const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-                     int cc, cudaStream_t stream)
-{
-    if (query.cols <= 64)
-    {
-        matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {            
-        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {            
-        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-    }
-}
+        template <typename Dist, typename T> // Dispatcher for an array of n train sets (with optional per-set masks): chooses a wrapper from query.cols.
+        void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, 
+                             const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+                             int cc, cudaStream_t stream) // NOTE(review): cc is never read in this body.
+        {
+            if (query.cols <= 64) // at most 64 columns: matchUnrolled with BLOCK_SIZE = 16, MAX_DESC_LEN = 64
+            {
+                matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+            }
+            else if (query.cols <= 128) // 65..128 columns: matchUnrolled with BLOCK_SIZE = 16, MAX_DESC_LEN = 128
+            {
+                matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+            } // the 256/512/1024 unrolled variants below are commented out, so those widths reach the final else
+            /*else if (query.cols <= 256)
+            {
+                matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+            }
+            else if (query.cols <= 512)
+            {            
+                matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+            }
+            else if (query.cols <= 1024)
+            {            
+                matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+            }*/
+            else // more than 128 columns: non-unrolled match wrapper with BLOCK_SIZE = 16
+            {
+                match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
+            }
+        } 

-template <typename Dist, typename T> 
-void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks, 
-                     const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-                     int cc, cudaStream_t stream)
-{
-    if (query.cols <= 64)
-    {
-        matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 128)
-    {
-        matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    /*else if (query.cols <= 256)
-    {
-        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 512)
-    {            
-        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-    else if (query.cols <= 1024)
-    {            
-        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }*/
-    else
-    {
-        match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-    }
-} 
+        ///////////////////////////////////////////////////////////////////////////////
+        // Radius Match caller

-///////////////////////////////////////////////////////////////////////////////
-// Radius Match caller
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+            int cc, cudaStream_t stream)
+        { // Forwards to matchDispatcher with L1Dist<T>, converting query/train from DevMem2Db to DevMem2D_<T> via static_cast.
+            if (mask.data) // mask pointer is non-null: wrap the mask in SingleMask
+            {
+                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
+                    trainIdx, distance, nMatches, 
+                    cc, stream);
+            }
+            else // mask pointer is null: pass a default-constructed WithOutMask instead
+            {
+                matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
+                    trainIdx, distance, nMatches, 
+                    cc, stream);
+            }
+        }

-template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
-    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    int cc, cudaStream_t stream)
-{
-    if (mask.data)
-    {
-        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
-            trainIdx, distance, nMatches, 
-            cc, stream);
-    }
-    else
-    {
-        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
-            trainIdx, distance, nMatches, 
-            cc, stream);
-    }
-}
+        template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);

-template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
+            int cc, cudaStream_t stream)
+        { // Forwards to matchDispatcher with L2Dist, converting query/train from DevMem2Db to DevMem2D_<T> via static_cast.
+            if (mask.data) // mask pointer is non-null: wrap the mask in SingleMask
+            {
+                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
+                    trainIdx, distance, nMatches, 
+                    cc, stream);
+            }
+            else // mask pointer is null: pass a default-constructed WithOutMask instead
+            {
+                matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
+                    trainIdx, distance, nMatches, 
+                    cc, stream);
+            }
+        }

-template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
-    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    int cc, cudaStream_t stream)
-{
-    if (mask.data)
-    {
-        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
-            trainIdx, distance, nMatches, 
-            cc, stream);
-    }
-    else
-    {
-        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
-            trainIdx, distance, nMatches, 
-            cc, stream);
-    }
-}
+        //template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);  // float is the only element type explicitly instantiated for the single-train L2 overload; the integer variants above are commented out

-//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,  // bf_radius_match entry point using HammingDist against a single train set; all work is delegated to matchDispatcher
+            const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,  // forwarded untouched (presumably the result buffers - confirm against matchDispatcher)
+            int cc, cudaStream_t stream)  // cc and stream are forwarded as-is
+        {
+            if (mask.data)  // a non-null mask pointer selects the masked variant
+            {
+                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),  // byte descriptors are converted to DevMem2D_<T> views; the mask is wrapped in SingleMask
+                    trainIdx, distance, nMatches, 
+                    cc, stream);
+            }
+            else  // no mask supplied: use the WithOutMask policy object instead
+            {
+                matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
+                    trainIdx, distance, nMatches, 
+                    cc, stream);
+            }
+        }

-template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask, 
-    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    int cc, cudaStream_t stream)
-{
-    if (mask.data)
-    {
-        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask), 
-            trainIdx, distance, nMatches, 
-            cc, stream);
-    }
-    else
-    {
-        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(), 
-            trainIdx, distance, nMatches, 
-            cc, stream);
-    }
-}
+        template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);  // single-train Hamming overload is explicitly instantiated for uchar, ushort and int; schar and short are commented out
+        //template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);

-template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchHamming_gpu<int   >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,  // L1Dist<T> overload taking train sets and masks by pointer; n is forwarded with them (presumably the array length - confirm)
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,  // unlike the single-train overload this one also forwards imgIdx
+            int cc, cudaStream_t stream)
+        {
+            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,  // the C-style cast reinterprets the DevMem2Db pointer as DevMem2D_<T> in place; nothing is converted element by element here
+                trainIdx, imgIdx, distance, nMatches, 
+                cc, stream);
+        }

-template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, 
-    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    int cc, cudaStream_t stream)
-{
-    matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, 
-        trainIdx, imgIdx, distance, nMatches, 
-        cc, stream);
-}
+        template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);  // multi-train L1 overload is explicitly instantiated for uchar, ushort, short, int and float; schar is commented out
+        //template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);

-template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,  // L2Dist overload taking train sets and masks by pointer; n is forwarded with them (presumably the array length - confirm)
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,  // unlike the single-train overload this one also forwards imgIdx
+            int cc, cudaStream_t stream)
+        {
+            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,  // the C-style cast reinterprets the DevMem2Db pointer as DevMem2D_<T> in place; nothing is converted element by element here
+                trainIdx, imgIdx, distance, nMatches, 
+                cc, stream);
+        }

-template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, 
-    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    int cc, cudaStream_t stream)
-{
-    matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, 
-        trainIdx, imgIdx, distance, nMatches, 
-        cc, stream);
-}
+        //template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);  // float is the only element type explicitly instantiated for the multi-train L2 overload; the integer variants above are commented out

-//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchL2_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,  // HammingDist overload taking train sets and masks by pointer; n is forwarded with them (presumably the array length - confirm)
+            const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,  // unlike the single-train overload this one also forwards imgIdx
+            int cc, cudaStream_t stream)
+        {
+            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,  // the C-style cast reinterprets the DevMem2Db pointer as DevMem2D_<T> in place; nothing is converted element by element here
+                trainIdx, imgIdx, distance, nMatches, 
+                cc, stream);
+        }

-template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, 
-    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, 
-    int cc, cudaStream_t stream)
-{
-    matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks, 
-        trainIdx, imgIdx, distance, nMatches, 
-        cc, stream);
-}
-
-template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
-
-} // namespace bf_radius_match
-
-END_OPENCV_DEVICE_NAMESPACE
+        template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);  // multi-train Hamming overload is explicitly instantiated for uchar, ushort and int; schar and short are commented out
+        //template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        //template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+        template void matchHamming_gpu<int   >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+    } // namespace bf_radius_match
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ b/modules/gpu/src/cuda/bilateral_filter.cu
@@ -43,186 +43,184 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/limits.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace bilateral_filter {
-
-__constant__ float* ctable_color;
-__constant__ float* ctable_space;
-__constant__ size_t ctable_space_step;
-
-__constant__ int cndisp;
-__constant__ int cradius;
-
-__constant__ short cedge_disc;
-__constant__ short cmax_disc;
-
-void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
+namespace cv { namespace gpu { namespace device 
 {
-    cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
-    cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
-    size_t table_space_step = table_space.step / sizeof(float);
-    cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
-
-    cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
-    cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
-
-    cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
-    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
-}
-
-template <int channels>
-struct DistRgbMax
-{
-    static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
+    namespace bilateral_filter 
    {
-        uchar x = ::abs(a[0] - b[0]);
-        uchar y = ::abs(a[1] - b[1]);
-        uchar z = ::abs(a[2] - b[2]);
-        return (::max(::max(x, y), z));
-    }
-};
+        __constant__ float* ctable_color;  // pointer to the colour weight table; the kernel indexes it with a uchar colour distance
+        __constant__ float* ctable_space;  // pointer to the 2D spatial weight table; the kernel indexes it by |dy| (row) and |dx| (column)
+        __constant__ size_t ctable_space_step;  // row stride of ctable_space in floats (load_constants divides the byte step by sizeof(float))

-template <>
-struct DistRgbMax<1>
-{
-    static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
-    {
-        return ::abs(a[0] - b[0]);
-    }
-};
+        __constant__ int cndisp;  // copy of the ndisp argument of load_constants; not read by the code visible in this file section
+        __constant__ int cradius;  // half-size of the window the kernel scans around each pixel

-template <int channels, typename T>
-__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
-{
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
+        __constant__ short cedge_disc;  // a pixel is refined only if some 4-neighbour disparity differs from it by at least this much
+        __constant__ short cmax_disc;  // upper cap applied to each disparity difference before it is weighted

-    T dp[5];
-
-    if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-    {
-        dp[0] = *(disp + (y  ) * disp_step + x + 0);
-        dp[1] = *(disp + (y-1) * disp_step + x + 0);
-        dp[2] = *(disp + (y  ) * disp_step + x - 1);
-        dp[3] = *(disp + (y+1) * disp_step + x + 0);
-        dp[4] = *(disp + (y  ) * disp_step + x + 1);
-
-        if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)            
+        void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)  // uploads the filter parameters into the __constant__ symbols of this namespace; every copy is checked with cudaSafeCall
        {
-            const int ymin = ::max(0, y - cradius);
-            const int xmin = ::max(0, x - cradius);
-            const int ymax = ::min(h - 1, y + cradius);
-            const int xmax = ::min(w - 1, x + cradius);
+            cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );  // copies the pointer value itself (sizeof a float*), not the table contents
+            cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );  // likewise stores only the data pointer of table_space
+            size_t table_space_step = table_space.step / sizeof(float);  // convert the byte step into a stride counted in floats
+            cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );

-            float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );

-            const uchar* ic = img + y * img_step + channels * x;
+            cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
+        }

-            for(int yi = ymin; yi <= ymax; yi++)
+        template <int channels>  // generic case: calc always reads three bytes per pixel, whatever the value of channels
+        struct DistRgbMax
+        {
+            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)  // returns the largest per-channel absolute difference between a[0..2] and b[0..2]
            {
-                const T* disp_y = disp + yi * disp_step;
+                uchar x = ::abs(a[0] - b[0]);
+                uchar y = ::abs(a[1] - b[1]);
+                uchar z = ::abs(a[2] - b[2]);
+                return (::max(::max(x, y), z));
+            }
+        };

-                for(int xi = xmin; xi <= xmax; xi++)
+        template <>  // single-channel specialisation of DistRgbMax
+        struct DistRgbMax<1>
+        {
+            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)  // only a[0] and b[0] are read
+            {
+                return ::abs(a[0] - b[0]);  // absolute difference of the one channel
+            }
+        };
+
+        template <int channels, typename T>  // channels: bytes per img pixel (also selects the DistRgbMax variant); T: element type of disp
+        __global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)  // one refinement pass that rewrites disp in place; disp_step is applied to a T* (element stride), img_step to a uchar* (byte stride)
+        {
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);  // each thread owns every second column; (y + t) & 1 picks the column parity, so t = 0 and t = 1 cover complementary pixels
+
+            T dp[5];  // candidate disparities: centre, up, left, down, right
+
+            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)  // skip the one-pixel border so all four neighbours exist
+            {
+                dp[0] = *(disp + (y  ) * disp_step + x + 0);
+                dp[1] = *(disp + (y-1) * disp_step + x + 0);
+                dp[2] = *(disp + (y  ) * disp_step + x - 1);
+                dp[3] = *(disp + (y+1) * disp_step + x + 0);
+                dp[4] = *(disp + (y  ) * disp_step + x + 1);
+
+                if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)            
                {
-                    const uchar* in = img + yi * img_step + channels * xi;
+                    const int ymin = ::max(0, y - cradius);  // window of half-size cradius around (x, y), clamped to the image bounds
+                    const int xmin = ::max(0, x - cradius);
+                    const int ymax = ::min(h - 1, y + cradius);
+                    const int xmax = ::min(w - 1, x + cradius);

-                    uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
+                    float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};  // accumulated weighted cost, one slot per candidate in dp

-                    const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
+                    const uchar* ic = img + y * img_step + channels * x;  // centre pixel of img

-                    const T disp_reg = disp_y[xi];
+                    for(int yi = ymin; yi <= ymax; yi++)
+                    {
+                        const T* disp_y = disp + yi * disp_step;

-                    cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
-                    cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
-                    cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
-                    cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
-                    cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
+                        for(int xi = xmin; xi <= xmax; xi++)
+                        {
+                            const uchar* in = img + yi * img_step + channels * xi;
+
+                            uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
+
+                            const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];  // colour weight looked up by colour distance, multiplied by the spatial weight looked up by (|dy|, |dx|)
+
+                            const T disp_reg = disp_y[xi];
+
+                            cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;  // each difference is capped at cmax_disc before it is weighted
+                            cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
+                            cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
+                            cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
+                            cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
+                        }
+                    }
+
+                    float minimum = numeric_limits<float>::max();  // pick the cheapest candidate; the comparisons are strict, so ties keep the lowest index
+                    int id = 0;
+
+                    if (cost[0] < minimum)
+                    {
+                        minimum = cost[0];
+                        id = 0;
+                    }
+                    if (cost[1] < minimum)
+                    {
+                        minimum = cost[1];
+                        id = 1;
+                    }
+                    if (cost[2] < minimum)
+                    {
+                        minimum = cost[2];
+                        id = 2;
+                    }
+                    if (cost[3] < minimum)
+                    {
+                        minimum = cost[3];
+                        id = 3;
+                    }
+                    if (cost[4] < minimum)
+                    {
+                        minimum = cost[4];
+                        id = 4;
+                    }
+
+                    *(disp + y * disp_step + x) = dp[id];  // write the winning candidate back into disp
                }
            }
-
-            float minimum = numeric_limits<float>::max();
-            int id = 0;
-
-            if (cost[0] < minimum)
-            {
-                minimum = cost[0];
-                id = 0;
-            }
-            if (cost[1] < minimum)
-            {
-                minimum = cost[1];
-                id = 1;
-            }
-            if (cost[2] < minimum)
-            {
-                minimum = cost[2];
-                id = 2;
-            }
-            if (cost[3] < minimum)
-            {
-                minimum = cost[3];
-                id = 3;
-            }
-            if (cost[4] < minimum)
-            {
-                minimum = cost[4];
-                id = 4;
-            }
-
-            *(disp + y * disp_step + x) = dp[id];
        }
-    }
-}

-template <typename T>     
-void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-    grid.x = divUp(disp.cols, threads.x << 1);
-    grid.y = divUp(disp.rows, threads.y);
-
-    switch (channels)
-    {
-    case 1:
-        for (int i = 0; i < iters; ++i)
+        template <typename T>     
+        void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)  // host launcher: for each of iters iterations runs the bilateral_filter kernel twice (t = 0, then t = 1) on stream; channels must be 1 or 3
        {
-            bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-            cudaSafeCall( cudaGetLastError() );
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);
+            grid.x = divUp(disp.cols, threads.x << 1);  // each thread handles every second column, so one block spans 2 * threads.x columns
+            grid.y = divUp(disp.rows, threads.y);

-            bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-            cudaSafeCall( cudaGetLastError() );
+            switch (channels)  // channels is a runtime value, the kernel needs it as a template argument
+            {
+            case 1:
+                for (int i = 0; i < iters; ++i)
+                {
+                    bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);  // disp.step / sizeof(T) turns the byte step into an element stride; img.step stays in bytes
+                    cudaSafeCall( cudaGetLastError() );  // surfaces launch-configuration errors immediately
+
+                    bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+                    cudaSafeCall( cudaGetLastError() );
+                }
+                break;
+            case 3:
+                for (int i = 0; i < iters; ++i)
+                {
+                    bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+                    cudaSafeCall( cudaGetLastError() );
+
+                    bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
+                    cudaSafeCall( cudaGetLastError() );
+                }
+                break;
+            default:
+                cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);  // any channel count other than 1 or 3 is rejected
+            }
+
+            if (stream == 0)  // fix: the test was inverted (stream != 0); block only for the default (null) stream so calls made on a user stream stay asynchronous
+                cudaSafeCall( cudaDeviceSynchronize() );
        }
-        break;
-    case 3:
-        for (int i = 0; i < iters; ++i)
+
+        void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)  // non-template overload for a DevMem2Db disparity map; forwards every argument to bilateral_filter_caller
        {
-            bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-            cudaSafeCall( cudaGetLastError() );
+            bilateral_filter_caller(disp, img, channels, iters, stream);  // T of the caller is deduced from the type of disp
        }
-        break;
-    default:
-        cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
-    }

-    if (stream != 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
-{
-    bilateral_filter_caller(disp, img, channels, iters, stream);
-}
-
-void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
-{
-    bilateral_filter_caller(disp, img, channels, iters, stream);
-}
-
-} // namespace bilateral_filter
-
-END_OPENCV_DEVICE_NAMESPACE
+        void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)  // non-template overload for a short disparity map; forwards every argument to bilateral_filter_caller
+        {
+            bilateral_filter_caller(disp, img, channels, iters, stream);  // instantiates the caller with T = short
+        }
+    } // namespace bilateral_filter
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/blend.cu
+++ b/modules/gpu/src/cuda/blend.cu
@@ -42,77 +42,75 @@

 #include "internal_shared.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace blend {
-
-template <typename T>
-__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
-                                  const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
+namespace cv { namespace gpu { namespace device 
 {
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (y < rows && x < cols)
+    namespace blend 
    {
-        int x_ = x / cn;
-        float w1 = weights1.ptr(y)[x_];
-        float w2 = weights2.ptr(y)[x_];
-        T p1 = img1.ptr(y)[x];
-        T p2 = img2.ptr(y)[x];
-        result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
-    }
-}	
+        // Weighted blend of two images, one thread per stored element.
+        // `cols` is the row length in elements as handed over by the launcher
+        // (pixel columns times `cn`), so x / cn selects the weight of the pixel
+        // the element belongs to.
+        template <typename T>
+        __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
+                                          const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;

+            // Threads that fall outside the image have nothing to do.
+            if (row >= rows || col >= cols)
+                return;
+
+            const int pixel = col / cn;
+            const float wa = weights1.ptr(row)[pixel];
+            const float wb = weights2.ptr(row)[pixel];
+            const T a = img1.ptr(row)[col];
+            const T b = img2.ptr(row)[col];
+
+            // The small epsilon avoids a zero denominator when both weights are zero.
+            result.ptr(row)[col] = (a * wa + b * wb) / (wa + wb + 1e-5f);
+        }

-    if (stream == 0)
-        cudaSafeCall(cudaDeviceSynchronize());
-}
+        // Host launcher for blendLinearKernel: one thread per stored element
+        // (cols * cn per row) in 16x16 blocks. Launch errors are checked right
+        // away; the call waits for the device only on the default stream (0).
+        template <typename T>
+        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
+        {
+            const int rowElems = cols * cn;
+
+            const dim3 block(16, 16);
+            const dim3 grid(divUp(rowElems, block.x), divUp(rows, block.y));
+
+            blendLinearKernel<<<grid, block, 0, stream>>>(rows, rowElems, cn, img1, img2, weights1, weights2, result);
+            cudaSafeCall( cudaGetLastError() );

+            const bool usesDefaultStream = (stream == 0);
+            if (usesDefaultStream)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+        // Explicit instantiations of blendLinearCaller for uchar and float elements.
+        template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
+        template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);


-__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
-                                      const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
-{
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
+        // Weighted blend for images accessed as uchar4 pixels: each thread reads
+        // one uchar4 from each input and writes one blended uchar4. The two
+        // weights are first divided by their sum (plus a small epsilon).
+        __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
+                                              const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;

+            // Threads that fall outside the image have nothing to do.
+            if (row >= rows || col >= cols)
+                return;
+
+            float wa = weights1.ptr(row)[col];
+            float wb = weights2.ptr(row)[col];
+
+            // Normalise so that the two weights sum to (almost exactly) one.
+            const float norm = 1.f / (wa + wb + 1e-5f);
+            wa *= norm;
+            wb *= norm;
+
+            const uchar4 a = ((const uchar4*)img1.ptr(row))[col];
+            const uchar4 b = ((const uchar4*)img2.ptr(row))[col];
+
+            uchar4* const out = (uchar4*)result.ptr(row);
+            out[col] = make_uchar4(a.x * wa + b.x * wb, a.y * wa + b.y * wb,
+                                   a.z * wa + b.z * wb, a.w * wa + b.w * wb);
+        }

-void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
-{
-    dim3 threads(16, 16);
-    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-    
-    blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
-    cudaSafeCall( cudaGetLastError() );
+        // Host launcher for blendLinearKernel8UC4: one thread per pixel in 16x16
+        // blocks. Launch errors are checked right away; the call waits for the
+        // device only on the default stream (0).
+        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
+        {
+            const dim3 block(16, 16);
+            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
+
+            blendLinearKernel8UC4<<<grid, block, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
+            cudaSafeCall( cudaGetLastError() );

+            const bool usesDefaultStream = (stream == 0);
+            if (usesDefaultStream)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+    } // namespace blend 
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/calib3d.cu
+++ b/modules/gpu/src/cuda/calib3d.cu
@@ -44,149 +44,148 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200
-
-namespace transform_points
+namespace cv { namespace gpu { namespace device 
 {
-    __constant__ float3 crot0;
-    __constant__ float3 crot1;
-    __constant__ float3 crot2;
-    __constant__ float3 ctransl;
+    #define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200

-    struct TransformOp : unary_function<float3, float3>
+    namespace transform_points
    {
-        __device__ __forceinline__ float3 operator()(const float3& p) const
+        __constant__ float3 crot0;
+        __constant__ float3 crot1;
+        __constant__ float3 crot2;
+        __constant__ float3 ctransl;
+
+        struct TransformOp : unary_function<float3, float3>
        {
-            return make_float3(
-                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+            __device__ __forceinline__ float3 operator()(const float3& p) const
+            {
+                return make_float3(
+                        crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+            }
+        };
+
+        // Uploads the transform parameters to the __constant__ symbols of this
+        // namespace and applies TransformOp to every element of src, writing
+        // the results to dst.
+        //   rot    - 9 floats, copied as three consecutive groups of 3 into
+        //            crot0, crot1 and crot2
+        //   transl - 3 floats copied into ctransl
+        //   stream - passed through to transform()
+        // NOTE(review): the cudaMemcpyToSymbol calls are not issued on `stream`;
+        // presumably callers never overlap calls with different parameters - confirm.
+        void call(const DevMem2D_<float3> src, const float* rot,
+                  const float* transl, DevMem2D_<float3> dst,
+                  cudaStream_t stream)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+            ::cv::gpu::device::transform(src, dst, TransformOp(), stream);
        }
-    };
+    } // namespace transform_points

-    void call(const DevMem2D_<float3> src, const float* rot,
-              const float* transl, DevMem2D_<float3> dst,
-              cudaStream_t stream)
+    namespace project_points
    {
-        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
-    }
-} // namespace transform_points
+        __constant__ float3 crot0;
+        __constant__ float3 crot1;
+        __constant__ float3 crot2;
+        __constant__ float3 ctransl;
+        __constant__ float3 cproj0;
+        __constant__ float3 cproj1;

-namespace project_points
-{
-    __constant__ float3 crot0;
-    __constant__ float3 crot1;
-    __constant__ float3 crot2;
-    __constant__ float3 ctransl;
-    __constant__ float3 cproj0;
-    __constant__ float3 cproj1;
-
-    struct ProjectOp : unary_function<float3, float3>
-    {
-        __device__ __forceinline__ float2 operator()(const float3& p) const
+        // Functor mapping a 3D point to a 2D point using the __constant__
+        // parameters of this namespace: the point is transformed with the rows
+        // crot0..crot2 plus ctransl, then projected with cproj0 / cproj1
+        // (x and y terms divided by the transformed z, .z term added afterwards).
+        // The result type is float2, matching what operator() returns.
+        // NOTE(review): the division by t.z is unguarded - presumably inputs never
+        // transform to z == 0; confirm with callers.
+        struct ProjectOp : unary_function<float3, float2>
        {
-            // Rotate and translate in 3D
-            float3 t = make_float3(
-                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
-                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
-                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
-            // Project on 2D plane
-            return make_float2(
-                    (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
-                    (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
+            __device__ __forceinline__ float2 operator()(const float3& p) const
+            {
+                // Rotate and translate in 3D
+                float3 t = make_float3(
+                        crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
+                        crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
+                        crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
+                // Project on 2D plane
+                return make_float2(
+                        (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
+                        (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
+            }
+        };
+
+        // Uploads the projection parameters to the __constant__ symbols of this
+        // namespace and applies ProjectOp to every element of src, writing the
+        // float2 results to dst.
+        //   rot    - 9 floats, copied as three consecutive groups of 3 into
+        //            crot0, crot1 and crot2
+        //   transl - 3 floats copied into ctransl
+        //   proj   - 6 floats, copied as two consecutive groups of 3 into
+        //            cproj0 and cproj1
+        //   stream - passed through to transform()
+        // NOTE(review): the cudaMemcpyToSymbol calls are not issued on `stream`;
+        // presumably callers never overlap calls with different parameters - confirm.
+        void call(const DevMem2D_<float3> src, const float* rot,
+                  const float* transl, const float* proj, DevMem2D_<float2> dst,
+                  cudaStream_t stream)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
+            cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
+            ::cv::gpu::device::transform(src, dst, ProjectOp(), stream);
        }
-    };
+    } // namespace project_points

-    void call(const DevMem2D_<float3> src, const float* rot,
-              const float* transl, const float* proj, DevMem2D_<float2> dst,
-              cudaStream_t stream)
+    namespace solve_pnp_ransac
    {
-        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
-        cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
-        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
-    }
-} // namespace project_points
+        __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
+        __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];

-namespace solve_pnp_ransac
-{
-    __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
-    __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];
-
-    int maxNumIters()
-    {
-        return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
-    }
-
-    __device__ __forceinline__ float sqr(float x)
-    {
-        return x * x;
-    }
-
-    __global__ void computeHypothesisScoresKernel(
-            const int num_points, const float3* object, const float2* image,
-            const float dist_threshold, int* g_num_inliers)
-    {
-        const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
-        const float3 &transl_vec = ctransl_vectors[blockIdx.x];
-        int num_inliers = 0;
-
-        for (int i = threadIdx.x; i < num_points; i += blockDim.x)
+        int maxNumIters()
        {
-            float3 p = object[i];
-            p = make_float3(
-                    rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
-                    rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
-                    rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
-            p.x /= p.z;
-            p.y /= p.z;
-            float2 image_p = image[i];
-            if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
-                ++num_inliers;
+            return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
        }

-        extern __shared__ float s_num_inliers[];
-        s_num_inliers[threadIdx.x] = num_inliers;
-        __syncthreads();
-
-        for (int step = blockDim.x / 2; step > 0; step >>= 1)
+        __device__ __forceinline__ float sqr(float x)
        {
-            if (threadIdx.x < step)
-                s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
+            return x * x;
+        }
+
+        // Scores one hypothesis per block: counts the object points that, after
+        // applying the rotation rows and translation stored for blockIdx.x and
+        // dividing by z, lie closer to the matching image point than
+        // dist_threshold (compared against the squared distance). The count is
+        // written to g_num_inliers[blockIdx.x].
+        //
+        // Launch requirements visible from the code:
+        //   - blockIdx.x selects the entry of crot_matrices / ctransl_vectors;
+        //   - blockDim.x * sizeof(int) bytes of dynamic shared memory (the same
+        //     size as blockDim.x * sizeof(float), which the launcher passes);
+        //   - blockDim.x must be a power of two for the halving reduction below.
+        __global__ void computeHypothesisScoresKernel(
+                const int num_points, const float3* object, const float2* image,
+                const float dist_threshold, int* g_num_inliers)
+        {
+            const float3* const rot_mat = crot_matrices + blockIdx.x * 3;
+            const float3 &transl_vec = ctransl_vectors[blockIdx.x];
+            int num_inliers = 0;
+
+            // Each thread visits the points threadIdx.x, threadIdx.x + blockDim.x, ...
+            for (int i = threadIdx.x; i < num_points; i += blockDim.x)
+            {
+                float3 p = object[i];
+                p = make_float3(
+                        rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
+                        rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
+                        rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
+                p.x /= p.z;
+                p.y /= p.z;
+                float2 image_p = image[i];
+                if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
+                    ++num_inliers;
+            }
+
+            // The partial results are integer counts, so they are reduced in an int
+            // buffer: exact for any count and free of int <-> float conversions.
+            extern __shared__ int s_num_inliers[];
+            s_num_inliers[threadIdx.x] = num_inliers;
            __syncthreads();
+
+            // Tree reduction: every pass folds the upper half onto the lower half.
+            for (int step = blockDim.x / 2; step > 0; step >>= 1)
+            {
+                if (threadIdx.x < step)
+                    s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
+                __syncthreads();
+            }
+
+            if (threadIdx.x == 0)
+                g_num_inliers[blockIdx.x] = s_num_inliers[0];
        }

-        if (threadIdx.x == 0)
-            g_num_inliers[blockIdx.x] = s_num_inliers[0];
-    }
+        void computeHypothesisScores(
+                const int num_hypotheses, const int num_points, const float* rot_matrices,
+                const float3* transl_vectors, const float3* object, const float2* image,
+                const float dist_threshold, int* hypothesis_scores)
+        {
+            cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
+            cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));

-    void computeHypothesisScores(
-            const int num_hypotheses, const int num_points, const float* rot_matrices,
-            const float3* transl_vectors, const float3* object, const float2* image,
-            const float dist_threshold, int* hypothesis_scores)
-    {
-        cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
-        cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));
+            dim3 threads(256);
+            dim3 grid(num_hypotheses);
+            int smem_size = threads.x * sizeof(float);

-        dim3 threads(256);
-        dim3 grid(num_hypotheses);
-        int smem_size = threads.x * sizeof(float);
+            computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
+                    num_points, object, image, dist_threshold, hypothesis_scores);
+            cudaSafeCall( cudaGetLastError() );

-        computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
-                num_points, object, image, dist_threshold, hypothesis_scores);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-} // namespace solvepnp_ransac
-
-END_OPENCV_DEVICE_NAMESPACE
+            cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    } // namespace solve_pnp_ransac
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -44,450 +44,448 @@
 #include <algorithm>
 #include "internal_shared.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace canny {
-
-__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+namespace cv { namespace gpu { namespace device 
 {
-    __shared__ int smem[16][18];
-
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i < rows)
+    namespace canny 
    {
-        smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
-        if (threadIdx.x == 0)
+        __global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
        {
-            smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
-            smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
-        }
-        __syncthreads();
+            __shared__ int smem[16][18];

-        if (j < cols)
-        {
-            dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
-            dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
-        }
-    }
-}
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;

-void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
-
-struct L1
-{
-    static __device__ __forceinline__ float calc(int x, int y)
-    {
-        return ::abs(x) + ::abs(y);
-    }
-};
-struct L2
-{
-    static __device__ __forceinline__ float calc(int x, int y)
-    {
-        return ::sqrtf(x * x + y * y);
-    }
-};
-
-template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf, 
-    PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-{
-    __shared__ int sdx[18][16];
-    __shared__ int sdy[18][16];
-
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (j < cols)
-    {
-        sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
-        sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
-        if (threadIdx.y == 0)
-        {
-            sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
-            sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
-
-            sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
-            sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
-        }
-        __syncthreads();
-
-        if (i < rows)
-        {
-            int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
-            int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
-
-            dx.ptr(i)[j] = x;
-            dy.ptr(i)[j] = y;
-
-            mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
-        }
-    }
-}
-
-void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    if (L2Grad)
-        calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-    else
-        calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
-
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
-
-template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
-{
-    const int j = blockIdx.x * blockDim.x + threadIdx.x;
-    const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i < rows && j < cols)
-        mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
-}
-
-void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    if (L2Grad)
-        calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
-    else
-        calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);
-
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-    
-#define CANNY_SHIFT 15
-#define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
-
-__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
-{
-    __shared__ float smem[18][18];
-
-    const int j = blockIdx.x * 16 + threadIdx.x;
-    const int i = blockIdx.y * 16 + threadIdx.y;
-
-    const int tid = threadIdx.y * 16 + threadIdx.x;
-    const int lx = tid % 18;
-    const int ly = tid / 18;
-
-    if (ly < 14)
-        smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
-
-    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-        smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
-
-    __syncthreads();
-
-    if (i < rows && j < cols)
-    {
-        int x = dx.ptr(i)[j];
-        int y = dy.ptr(i)[j];
-        const int s = (x ^ y) < 0 ? -1 : 1;
-        const float m = smem[threadIdx.y + 1][threadIdx.x + 1];
-
-        x = ::abs(x);
-        y = ::abs(y);
-
-        // 0 - the pixel can not belong to an edge
-        // 1 - the pixel might belong to an edge
-        // 2 - the pixel does belong to an edge
-        int edge_type = 0;
-
-        if (m > low_thresh)
-        {
-            const int tg22x = x * TG22;
-            const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
-
-            y <<= CANNY_SHIFT;
-
-            if (y < tg22x)
+            if (i < rows)
            {
-                if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else if( y > tg67x )
-            {
-                if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else
-            {
-                if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-        }
-        
-        map.ptr(i + 1)[j + 1] = edge_type;
-    }
-}
-
-#undef CANNY_SHIFT
-#undef TG22
-
-void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-__device__ unsigned int counter = 0;
-
-__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
-{
-    #if __CUDA_ARCH__ >= 120
-
-    __shared__ int smem[18][18];
-
-    const int j = blockIdx.x * 16 + threadIdx.x;
-    const int i = blockIdx.y * 16 + threadIdx.y;
-
-    const int tid = threadIdx.y * 16 + threadIdx.x;
-    const int lx = tid % 18;
-    const int ly = tid / 18; 
-
-    if (ly < 14)
-        smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
-
-    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
-        smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
-
-    __syncthreads();
-
-    if (i < rows && j < cols)
-    {
-        int n;
-
-        #pragma unroll
-        for (int k = 0; k < 16; ++k)
-        {
-            n = 0;
-
-            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
-            {
-                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
-                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
-                
-                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
-                
-                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
-                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
-            }
-
-            if (n > 0)
-                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
-        }
-
-        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
-
-        map.ptr(i + 1)[j + 1] = e;
-
-        n = 0;
-
-        if (e == 2)
-        {
-            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
-            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
-            
-            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
-            
-            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
-            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
-        }
-
-        if (n > 0)
-        {
-            const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
-            st[ind] = make_ushort2(j + 1, i + 1);
-        }
-    }
-
-    #endif
-}
-
-void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
-
-    edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
-    cudaSafeCall( cudaGetLastError() );
-
-    cudaSafeCall(cudaThreadSynchronize());
-}
-
-__constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
-__constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
-
-__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
-{
-    #if __CUDA_ARCH__ >= 120
-
-    const int stack_size = 512;
-    
-    __shared__ unsigned int s_counter;
-    __shared__ unsigned int s_ind;
-    __shared__ ushort2 s_st[stack_size];
-
-    if (threadIdx.x == 0)
-        s_counter = 0;
-    __syncthreads();
-
-    int ind = blockIdx.y * gridDim.x + blockIdx.x;
-
-    if (ind < count)
-    {
-        ushort2 pos = st1[ind];
-
-        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
-        {
-            if (threadIdx.x < 8)
-            {
-                pos.x += c_dx[threadIdx.x];
-                pos.y += c_dy[threadIdx.x];
-
-                if (map.ptr(pos.y)[pos.x] == 1)
+                smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
+                if (threadIdx.x == 0)
                {
-                    map.ptr(pos.y)[pos.x] = 2;
+                    smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
+                    smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
+                }
+                __syncthreads();

-                    ind = atomicInc(&s_counter, (unsigned int)(-1));
-
-                    s_st[ind] = pos;
+                if (j < cols)
+                {
+                    dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
+                    dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
                }
            }
+        }
+
+        // Host launcher for calcSobelRowPass: 16x16 thread blocks tiling a
+        // rows x cols image on the default stream. Checks for launch errors and
+        // then waits for the device to finish.
+        void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+
+            calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
+            cudaSafeCall( cudaGetLastError() );
+
+            // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
+            // direct replacement and is already used elsewhere in this code base.
+            cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+        // Norm policy for calcMagnitude: |x| + |y|, returned as float.
+        struct L1
+        {
+            static __device__ __forceinline__ float calc(int x, int y)
+            {
+                const int absX = ::abs(x);
+                const int absY = ::abs(y);
+                return absX + absY;
+            }
+        };
+        // Norm policy for calcMagnitude: sqrt(x*x + y*y). The sum of squares is
+        // formed in int arithmetic before being handed to sqrtf.
+        struct L2
+        {
+            static __device__ __forceinline__ float calc(int x, int y)
+            {
+                const int sumSq = x * x + y * y;
+                return ::sqrtf(sumSq);
+            }
+        };
+
+        // Combines three vertically adjacent rows of the intermediate buffers:
+        //   dx = dx_buf[i-1] + 2 * dx_buf[i] + dx_buf[i+1]
+        //   dy = dy_buf[i+1] - dy_buf[i-1]
+        // and stores Norm::calc(dx, dy) at mag(i + 1, j + 1). Row indices outside
+        // the image are clamped to the nearest valid row.
+        //
+        // The shared tiles are sized for 16x16 thread blocks (16 rows plus one
+        // halo row above and below), which is what calcMagnitude_gpu launches.
+        // NOTE(review): the +1 offsets suggest `mag` carries a one-element border;
+        // confirm against the allocation in the caller.
+        template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
+            PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        {
+            __shared__ int sdx[18][16];
+            __shared__ int sdy[18][16];
+
+            const int j = blockIdx.x * blockDim.x + threadIdx.x;
+            const int i = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const bool colInside = j < cols;
+
+            if (colInside)
+            {
+                // Clamp the source row exactly like the halo rows below, so threads
+                // past the last image row never read outside the buffers and the
+                // last row sees a replicated neighbour instead of stray memory.
+                const int srcRow = ::min(i, rows - 1);
+
+                sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(srcRow)[j];
+                sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(srcRow)[j];
+                if (threadIdx.y == 0)
+                {
+                    sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
+                    sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];
+
+                    sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
+                    sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
+                }
+            }
+
+            // The barrier has to be reached by every thread of the block, so it
+            // must sit outside the column test (which differs between threads).
+            __syncthreads();
+
+            if (colInside && i < rows)
+            {
+                int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
+                int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];
+
+                dx.ptr(i)[j] = x;
+                dy.ptr(i)[j] = y;
+
+                mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
+            }
+        }
+
+        // Host launcher for the buffered calcMagnitude kernel (Sobel column pass + norm).
+        // L2Grad selects the L2 norm policy, otherwise the L1 policy is used.
+        // Launches 16x16 blocks over rows x cols and waits for completion.
+        void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        {
+            const dim3 threads(16, 16, 1);
+            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y), 1);
+
+            if (!L2Grad)
+                calcMagnitude<L1><<<blocks, threads>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+            else
+                calcMagnitude<L2><<<blocks, threads>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+
+        // Computes the gradient norm from already available dx / dy values.
+        // One thread per pixel; the result for pixel (row, col) is stored in mag at
+        // (row + 1, col + 1).
+        template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
+        {
+            const int col = blockIdx.x * blockDim.x + threadIdx.x;
+            const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+            // Threads of the rounded-up grid that fall outside the image have nothing to do.
+            if (row >= rows || col >= cols)
+                return;
+
+            const int gx = dx.ptr(row)[col];
+            const int gy = dy.ptr(row)[col];
+
+            mag.ptr(row + 1)[col + 1] = Norm::calc(gx, gy);
+        }
+
+        // Host launcher for the plain calcMagnitude kernel (norm of existing dx / dy).
+        // L2Grad selects the L2 norm policy, otherwise the L1 policy is used.
+        // Launches 16x16 blocks over rows x cols and waits for completion.
+        void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
+        {
+            const dim3 threads(16, 16, 1);
+            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y), 1);
+
+            if (!L2Grad)
+                calcMagnitude<L1><<<blocks, threads>>>(dx, dy, mag, rows, cols);
+            else
+                calcMagnitude<L2><<<blocks, threads>>>(dx, dy, mag, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+
+        //////////////////////////////////////////////////////////////////////////////////////////
+            
+        #define CANNY_SHIFT 15
+        #define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+        // Non-maximum suppression: labels every pixel 0 / 1 / 2 (see the legend inside the body)
+        // and stores the label in map at (i + 1, j + 1).
+        //
+        // dx, dy - gradient components, read at (i, j);
+        // mag    - gradient magnitude, read through an 18x18 shared tile in which element
+        //          [threadIdx.y + 1][threadIdx.x + 1] belongs to this thread's pixel;
+        // low_thresh / high_thresh - a local maximum with magnitude above low_thresh gets
+        //          label 1, and label 2 when it is also above high_thresh.
+        //
+        // The index arithmetic is hard-wired to 16x16 thread blocks.
+        __global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+        {
+            __shared__ float smem[18][18];
+
+            const int j = blockIdx.x * 16 + threadIdx.x;
+            const int i = blockIdx.y * 16 + threadIdx.y;
+
+            // Re-map the linear thread id onto 18-wide tile rows so the block can fill the tile.
+            const int tid = threadIdx.y * 16 + threadIdx.x;
+            const int lx = tid % 18;
+            const int ly = tid / 18;
+
+            // First pass: tile rows 0..13.
+            // NOTE(review): unlike the second pass below, this load is not checked against
+            // rows / cols; whether mag is always large enough is not visible here - confirm.
+            if (ly < 14)
+                smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];
+
+            // Second pass: the remaining tile rows 14..17, guarded against the image extent.
+            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+                smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];
+
             __syncthreads();

-            while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
+            if (i < rows && j < cols)
             {
-                const int subTaskIdx = threadIdx.x >> 3;
-                const int portion = ::min(s_counter, blockDim.x >> 3);
+                int x = dx.ptr(i)[j];
+                int y = dy.ptr(i)[j];
+                // s is -1 when x and y have opposite signs and +1 otherwise; it selects which
+                // diagonal neighbours are compared in the last branch below.
+                const int s = (x ^ y) < 0 ? -1 : 1;
+                const float m = smem[threadIdx.y + 1][threadIdx.x + 1];

-                pos.x = pos.y = 0;
+                x = ::abs(x);
+                y = ::abs(y);

-                if (subTaskIdx < portion)
-                    pos = s_st[s_counter - 1 - subTaskIdx];
-                __syncthreads();
-                    
-                if (threadIdx.x == 0)
-                    s_counter -= portion;
-                __syncthreads();
-                 
-                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                // 0 - the pixel can not belong to an edge
+                // 1 - the pixel might belong to an edge
+                // 2 - the pixel does belong to an edge
+                int edge_type = 0;
+
+                if (m > low_thresh)
                 {
-                    pos.x += c_dx[threadIdx.x & 7];
-                    pos.y += c_dy[threadIdx.x & 7];
+                    // Fixed-point direction test with CANNY_SHIFT fractional bits:
+                    // tg22x = |x| * tan(22.5 deg), tg67x = |x| * (tan(22.5 deg) + 2) = |x| * tan(67.5 deg).
+                    const int tg22x = x * TG22;
+                    const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);

-                    if (map.ptr(pos.y)[pos.x] == 1)
+                    y <<= CANNY_SHIFT;
+
+                    // |y| below the 22.5 deg line: compare with the left / right neighbours.
+                    if (y < tg22x)
                     {
-                        map.ptr(pos.y)[pos.x] = 2;
-
-                        ind = atomicInc(&s_counter, (unsigned int)(-1));
-
-                        s_st[ind] = pos;
+                        if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
+                            edge_type = 1 + (int)(m > high_thresh);
+                    }
+                    // |y| above the 67.5 deg line: compare with the upper / lower neighbours.
+                    else if( y > tg67x )
+                    {
+                        if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
+                            edge_type = 1 + (int)(m > high_thresh);
+                    }
+                    // In between: compare with the two diagonal neighbours chosen by s.
+                    else
+                    {
+                        if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
+                            edge_type = 1 + (int)(m > high_thresh);
                     }
                 }
-                __syncthreads();
-            }
-
-            if (s_counter > 0)
-            {
-                if (threadIdx.x == 0)
-                {
-                    ind = atomicAdd(&counter, s_counter);
-                    s_ind = ind - s_counter;
-                }
-                __syncthreads();
-
-                ind = s_ind;
-
-                for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
-                {
-                    st2[ind + i] = s_st[i];
-                }
+                
+                map.ptr(i + 1)[j + 1] = edge_type;
             }
         }
-    }

-    #endif
-}
+        #undef CANNY_SHIFT
+        #undef TG22

-void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
-{
-    void* counter_ptr;
-    cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
-    
-    unsigned int count;
-    cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+        // Host launcher for the calcMap kernel: 16x16 thread blocks, grid rounded up to cover
+        // rows x cols. Checks the launch with cudaGetLastError and waits for the kernel to finish.
+        void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
+        {
+            dim3 block(16, 16, 1);
+            dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

-    while (count > 0)
-    {
-        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );
+            calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
+            cudaSafeCall( cudaGetLastError() );

-        dim3 block(128, 1, 1);
-        dim3 grid(std::min(count, 65535u), divUp(count, 65535), 1);
-        edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
-        cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall(cudaThreadSynchronize());
+        }

-        cudaSafeCall(cudaThreadSynchronize());
+        //////////////////////////////////////////////////////////////////////////////////////////

-        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+        __device__ unsigned int counter = 0;

-        std::swap(st1, st2);
-    }
-}
+        // Tile-local stage of the hysteresis.
+        //
+        // Each block loads an 18x18 tile of map labels into shared memory, promotes label-1
+        // pixels that touch a label-2 pixel to label 2 (16 rounds inside the tile), writes the
+        // resulting label back to map at (i + 1, j + 1), and queues every label-2 pixel that
+        // still has a label-1 neighbour in the tile into st as (j + 1, i + 1). The queue
+        // position comes from the global `counter`.
+        //
+        // The index arithmetic is hard-wired to 16x16 thread blocks. The body is only compiled
+        // when __CUDA_ARCH__ >= 120; otherwise the kernel is empty.
+        __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
+        {
+            #if __CUDA_ARCH__ >= 120

-__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
-{
-    const int j = blockIdx.x * 16 + threadIdx.x;
-    const int i = blockIdx.y * 16 + threadIdx.y;
+            __shared__ int smem[18][18];

-    if (i < rows && j < cols)
-        dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
-}
+            const int j = blockIdx.x * 16 + threadIdx.x;
+            const int i = blockIdx.y * 16 + threadIdx.y;

-void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
-{
-    dim3 block(16, 16, 1);
-    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);
+            // Re-map the linear thread id onto 18-wide tile rows so the block can fill the tile.
+            const int tid = threadIdx.y * 16 + threadIdx.x;
+            const int lx = tid % 18;
+            const int ly = tid / 18; 

-    getEdges<<<grid, block>>>(map, dst, rows, cols);
-    cudaSafeCall( cudaGetLastError() );
+            // First pass: tile rows 0..13.
+            // NOTE(review): this load is not checked against rows / cols, unlike the one
+            // below; whether map is always large enough is not visible here - confirm.
+            if (ly < 14)
+                smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

-    cudaSafeCall(cudaThreadSynchronize());
-}
+            // Second pass: the remaining tile rows 14..17, guarded against the image extent.
+            if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
+                smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

-} // namespace canny
+            __syncthreads();

-END_OPENCV_DEVICE_NAMESPACE
+            if (i < rows && j < cols)
+            {
+                int n;
+
+                // Propagate label 2 inside the tile: n counts the label-2 cells among the
+                // 8 neighbours of a label-1 cell.
+                // NOTE(review): the rounds read neighbouring cells and write the own cell
+                // without a barrier in between, so how far a label travels per round depends
+                // on thread scheduling - confirm that this is intended.
+                #pragma unroll
+                for (int k = 0; k < 16; ++k)
+                {
+                    n = 0;
+
+                    if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
+                    {
+                        n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
+                        n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
+                        n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
+                        
+                        n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
+                        n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
+                        
+                        n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
+                        n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
+                        n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
+                    }
+
+                    if (n > 0)
+                        smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
+                }
+
+                const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
+
+                map.ptr(i + 1)[j + 1] = e;
+
+                // Count the label-1 neighbours of a label-2 pixel: such a pixel is queued in st.
+                n = 0;
+
+                if (e == 2)
+                {
+                    n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
+                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
+                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
+                    
+                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
+                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
+                    
+                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
+                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
+                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
+                }
+
+                if (n > 0)
+                {
+                    // atomicInc hands out a unique slot; the limit of (unsigned int)(-1) means the
+                    // counter only wraps at the maximum unsigned value.
+                    const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
+                    st[ind] = make_ushort2(j + 1, i + 1);
+                }
+            }
+
+            #endif
+        }
+
+        // Host launcher for the edgesHysteresisLocal kernel: 16x16 thread blocks, grid rounded
+        // up to cover rows x cols. st1 receives the pixel positions queued by the kernel.
+        // Waits for the kernel to finish before returning.
+        void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
+        {
+            const dim3 threads(16, 16, 1);
+            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y), 1);
+
+            edgesHysteresisLocal<<<blocks, threads>>>(map, st1, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+
+        __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
+        __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
+
+        // Global stage of the hysteresis: grows edges out of the seed positions queued in st1.
+        //
+        // Each block handles one seed, st1[blockIdx.y * gridDim.x + blockIdx.x] (blocks whose
+        // index is >= count do nothing). Threads work in groups of 8, one thread per
+        // 8-neighbour offset (c_dx / c_dy): a neighbour labelled 1 in map is relabelled 2 and
+        // pushed onto a shared-memory stack. The block then repeatedly pops up to
+        // blockDim.x / 8 positions and expands them the same way, until the stack is empty or
+        // holds more than stack_size - blockDim.x entries. Whatever is left on the stack is
+        // appended to st2, and the number of appended entries is added to the global `counter`.
+        //
+        // Positions are (x, y) in map coordinates; valid ones lie in [1, cols] x [1, rows].
+        // Only threadIdx.x is used, so the kernel expects 1-D thread blocks.
+        // The body is only compiled when __CUDA_ARCH__ >= 120; otherwise the kernel is empty.
+        //
+        // NOTE(review): the "== 1 then = 2" relabelling is not atomic, so the same pixel can
+        // be pushed by more than one thread - confirm that st2 is sized with this in mind.
+        __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
+        {
+            #if __CUDA_ARCH__ >= 120
+
+            const int stack_size = 512;
+
+            __shared__ unsigned int s_counter;   // number of entries on the shared stack
+            __shared__ unsigned int s_ind;       // first slot of this block's range in st2
+            __shared__ ushort2 s_st[stack_size]; // the shared stack itself
+
+            if (threadIdx.x == 0)
+                s_counter = 0;
+            __syncthreads();
+
+            int ind = blockIdx.y * gridDim.x + blockIdx.x;
+
+            // Both guards below depend only on block-wide values, so all threads of a block
+            // take the same path and the barriers inside are reached by every thread.
+            if (ind < count)
+            {
+                ushort2 pos = st1[ind];
+
+                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                {
+                    // Expand the seed itself: the first 8 threads each test one neighbour.
+                    if (threadIdx.x < 8)
+                    {
+                        pos.x += c_dx[threadIdx.x];
+                        pos.y += c_dy[threadIdx.x];
+
+                        if (map.ptr(pos.y)[pos.x] == 1)
+                        {
+                            map.ptr(pos.y)[pos.x] = 2;
+
+                            ind = atomicInc(&s_counter, (unsigned int)(-1));
+
+                            s_st[ind] = pos;
+                        }
+                    }
+                    __syncthreads();
+
+                    // Stop early while there is still room for one push per thread.
+                    while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
+                    {
+                        const int subTaskIdx = threadIdx.x >> 3;
+                        const int portion = ::min(s_counter, blockDim.x >> 3);
+
+                        // (0, 0) fails the range test below and marks "nothing to do".
+                        pos.x = pos.y = 0;
+
+                        // Every group of 8 threads pops one position from the top of the stack.
+                        if (subTaskIdx < portion)
+                            pos = s_st[s_counter - 1 - subTaskIdx];
+                        __syncthreads();
+
+                        if (threadIdx.x == 0)
+                            s_counter -= portion;
+                        __syncthreads();
+
+                        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                        {
+                            pos.x += c_dx[threadIdx.x & 7];
+                            pos.y += c_dy[threadIdx.x & 7];
+
+                            if (map.ptr(pos.y)[pos.x] == 1)
+                            {
+                                map.ptr(pos.y)[pos.x] = 2;
+
+                                ind = atomicInc(&s_counter, (unsigned int)(-1));
+
+                                s_st[ind] = pos;
+                            }
+                        }
+                        __syncthreads();
+                    }
+
+                    // Hand the unprocessed positions over to the next kernel launch.
+                    if (s_counter > 0)
+                    {
+                        if (threadIdx.x == 0)
+                        {
+                            // atomicAdd returns the counter value from before the addition,
+                            // which is the first slot of the range reserved for this block.
+                            // Subtracting s_counter from it would shift the range downwards
+                            // and let the first block write in front of st2.
+                            s_ind = atomicAdd(&counter, s_counter);
+                        }
+                        __syncthreads();
+
+                        ind = s_ind;
+
+                        for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
+                        {
+                            st2[ind + i] = s_st[i];
+                        }
+                    }
+                }
+            }
+
+            #endif
+        }
+
+        // Host driver of the global hysteresis stage.
+        //
+        // Reads the device-side `counter` to learn how many positions are queued in st1, then
+        // keeps launching edgesHysteresisGlobal (one 128-thread block per queued position, the
+        // grid being folded into two dimensions at 65535 blocks) until a launch leaves the
+        // counter at zero. The counter is reset before every launch, and st1 / st2 swap roles
+        // after every launch so that the output queue of one pass feeds the next.
+        void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
+        {
+            void* d_counter;
+            cudaSafeCall( cudaGetSymbolAddress(&d_counter, counter) );
+
+            unsigned int pending;
+            cudaSafeCall( cudaMemcpy(&pending, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+
+            const dim3 threads(128, 1, 1);
+
+            while (pending > 0)
+            {
+                cudaSafeCall( cudaMemset(d_counter, 0, sizeof(unsigned int)) );
+
+                const dim3 blocks(std::min(pending, 65535u), divUp(pending, 65535), 1);
+
+                edgesHysteresisGlobal<<<blocks, threads>>>(map, st1, st2, rows, cols, pending);
+                cudaSafeCall( cudaGetLastError() );
+                cudaSafeCall( cudaThreadSynchronize() );
+
+                cudaSafeCall( cudaMemcpy(&pending, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
+
+                std::swap(st1, st2);
+            }
+        }
+
+        // Converts the label map into the final edge image.
+        // The label of pixel (row, col) is read from map at (row + 1, col + 1); shifting it
+        // right by one and negating turns label 2 into 255 and labels 0 / 1 into 0.
+        // The index arithmetic is hard-wired to 16x16 thread blocks.
+        __global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
+        {
+            const int col = blockIdx.x * 16 + threadIdx.x;
+            const int row = blockIdx.y * 16 + threadIdx.y;
+
+            // Threads of the rounded-up grid that fall outside the image have nothing to do.
+            if (row >= rows || col >= cols)
+                return;
+
+            const int label = map.ptr(row + 1)[col + 1];
+
+            dst.ptr(row)[col] = (uchar)(-(label >> 1));
+        }
+
+        // Host launcher for the getEdges kernel: 16x16 thread blocks, grid rounded up to cover
+        // rows x cols. Waits for the kernel to finish before returning.
+        void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
+        {
+            const dim3 threads(16, 16, 1);
+            const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y), 1);
+
+            getEdges<<<blocks, threads>>>(map, dst, rows, cols);
+
+            cudaSafeCall( cudaGetLastError() );
+            cudaSafeCall( cudaThreadSynchronize() );
+        }
+    } // namespace canny
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@@ -44,181 +44,181 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/color.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
+namespace cv { namespace gpu { namespace device 
+{
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_x = 8 };
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_x = 8 };
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
+    {
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
+    {
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
-{
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
-{
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };    

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};    
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

-DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
-DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
-{
-    enum { smart_block_dim_y = 8 };
-    enum { smart_shift = 4 };
-};
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };

 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
    void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \
@@ -226,7 +226,7 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
        traits::functor_type functor = traits::create_functor(); \
        typedef typename traits::functor_type::argument_type src_t; \
        typedef typename traits::functor_type::result_type   dst_t; \
-        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
+        ::cv::gpu::device::transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
    }

 #define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
@@ -243,138 +243,137 @@ DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)

-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
-OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
+    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)

-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
-#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
-
-END_OPENCV_DEVICE_NAMESPACE
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
+    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@@ -47,203 +47,201 @@
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-#define MAX_KERNEL_SIZE 16
-#define BLOCK_DIM_X 16
-#define BLOCK_DIM_Y 4
-#define RESULT_STEPS 8
-#define HALO_STEPS 1
-
-namespace column_filter {
-
-__constant__ float c_kernel[MAX_KERNEL_SIZE];
-
-void loadKernel(const float kernel[], int ksize)
+namespace cv { namespace gpu { namespace device 
 {
-    cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
-}
+    #define MAX_KERNEL_SIZE 16
+    #define BLOCK_DIM_X 16
+    #define BLOCK_DIM_Y 4
+    #define RESULT_STEPS 8
+    #define HALO_STEPS 1

-template <int KERNEL_SIZE, typename T, typename D, typename B>
-__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
-{
-    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-
-    __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
-
-    //Offset to the upper halo edge
-    const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
-    const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;
-
-    if (x < src.cols)
+    namespace column_filter 
    {
-        const T* src_col = src.ptr() + x;
+        __constant__ float c_kernel[MAX_KERNEL_SIZE];

-        //Main data
-        #pragma unroll
-        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
-            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
-
-        //Upper halo
-        #pragma unroll
-        for(int i = 0; i < HALO_STEPS; ++i)
-            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
-
-        //Lower halo
-        #pragma unroll
-        for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
-            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]=  b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
-
-        __syncthreads();
-
-        #pragma unroll
-        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
+        void loadKernel(const float kernel[], int ksize)
        {
-            sum_t sum = VecTraits<sum_t>::all(0);
-
-            #pragma unroll
-            for(int j = 0; j < KERNEL_SIZE; ++j)
-                sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];
-
-            int dstY = y + i * BLOCK_DIM_Y;
-
-            if (dstY < src.rows)
-                dst.ptr(dstY)[x] = saturate_cast<D>(sum);
+            cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
        }
-    }
-}

-template <int ksize, typename T, typename D, template<typename> class B>
-void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
-{        
-    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
-    const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
+        template <int KERNEL_SIZE, typename T, typename D, typename B>
+        __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

-    B<T> b(src.rows);
+            __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];

-    linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
-    cudaSafeCall( cudaGetLastError() );
+            //Offset to the upper halo edge
+            const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
+            const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+            if (x < src.cols)
+            {
+                const T* src_col = src.ptr() + x;

-template <typename T, typename D>
-void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
-{
-    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
-    static const caller_t callers[5][17] = 
-    {
-        {
-            0, 
-            linearColumnFilter_caller<1 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
-            linearColumnFilter_caller<3 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<4 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<5 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<6 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<7 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<8 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<9 , T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<10, T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<11, T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<12, T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<13, T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<14, T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<15, T, D, BrdColReflect101>, 
-            linearColumnFilter_caller<16, T, D, BrdColReflect101> 
-        },
-        {
-            0, 
-            linearColumnFilter_caller<1 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
-            linearColumnFilter_caller<3 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<4 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<5 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<6 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<7 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<8 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<9 , T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<10, T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<11, T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<12, T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<13, T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<14, T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<15, T, D, BrdColReplicate>, 
-            linearColumnFilter_caller<16, T, D, BrdColReplicate>
-        },
-        {
-            0, 
-            linearColumnFilter_caller<1 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<2 , T, D, BrdColConstant>,
-            linearColumnFilter_caller<3 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<4 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<5 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<6 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<7 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<8 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<9 , T, D, BrdColConstant>, 
-            linearColumnFilter_caller<10, T, D, BrdColConstant>, 
-            linearColumnFilter_caller<11, T, D, BrdColConstant>, 
-            linearColumnFilter_caller<12, T, D, BrdColConstant>, 
-            linearColumnFilter_caller<13, T, D, BrdColConstant>, 
-            linearColumnFilter_caller<14, T, D, BrdColConstant>, 
-            linearColumnFilter_caller<15, T, D, BrdColConstant>, 
-            linearColumnFilter_caller<16, T, D, BrdColConstant> 
-        },
-        {
-            0, 
-            linearColumnFilter_caller<1 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<2 , T, D, BrdColReflect>,
-            linearColumnFilter_caller<3 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<4 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<5 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<6 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<7 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<8 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<9 , T, D, BrdColReflect>, 
-            linearColumnFilter_caller<10, T, D, BrdColReflect>, 
-            linearColumnFilter_caller<11, T, D, BrdColReflect>, 
-            linearColumnFilter_caller<12, T, D, BrdColReflect>, 
-            linearColumnFilter_caller<13, T, D, BrdColReflect>, 
-            linearColumnFilter_caller<14, T, D, BrdColReflect>, 
-            linearColumnFilter_caller<15, T, D, BrdColReflect>, 
-            linearColumnFilter_caller<16, T, D, BrdColReflect>
-        },
-        {
-            0, 
-            linearColumnFilter_caller<1 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<2 , T, D, BrdColWrap>,
-            linearColumnFilter_caller<3 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<4 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<5 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<6 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<7 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<8 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<9 , T, D, BrdColWrap>, 
-            linearColumnFilter_caller<10, T, D, BrdColWrap>, 
-            linearColumnFilter_caller<11, T, D, BrdColWrap>, 
-            linearColumnFilter_caller<12, T, D, BrdColWrap>, 
-            linearColumnFilter_caller<13, T, D, BrdColWrap>, 
-            linearColumnFilter_caller<14, T, D, BrdColWrap>, 
-            linearColumnFilter_caller<15, T, D, BrdColWrap>, 
-            linearColumnFilter_caller<16, T, D, BrdColWrap>,
+                //Main data
+                #pragma unroll
+                for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
+                    smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
+
+                //Upper halo
+                #pragma unroll
+                for(int i = 0; i < HALO_STEPS; ++i)
+                    smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
+
+                //Lower halo
+                #pragma unroll
+                for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
+                    smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]=  b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
+
+                __syncthreads();
+
+                #pragma unroll
+                for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
+                {
+                    sum_t sum = VecTraits<sum_t>::all(0);
+
+                    #pragma unroll
+                    for(int j = 0; j < KERNEL_SIZE; ++j)
+                        sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];
+
+                    int dstY = y + i * BLOCK_DIM_Y;
+
+                    if (dstY < src.rows)
+                        dst.ptr(dstY)[x] = saturate_cast<D>(sum);
+                }
+            }
        }
-    };
-    
-    loadKernel(kernel, ksize);

-    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-}
+        template <int ksize, typename T, typename D, template<typename> class B>
+        void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
+        {        
+            const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
+            const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));

-template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+            B<T> b(src.rows);

-} // namespace column_filter
+            linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
+            cudaSafeCall( cudaGetLastError() );

-END_OPENCV_DEVICE_NAMESPACE
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <typename T, typename D>
+        void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+        {
+            typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
+            static const caller_t callers[5][17] = 
+            {
+                {
+                    0, 
+                    linearColumnFilter_caller<1 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
+                    linearColumnFilter_caller<3 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<4 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<5 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<6 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<7 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<8 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<9 , T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<10, T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<11, T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<12, T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<13, T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<14, T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<15, T, D, BrdColReflect101>, 
+                    linearColumnFilter_caller<16, T, D, BrdColReflect101> 
+                },
+                {
+                    0, 
+                    linearColumnFilter_caller<1 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
+                    linearColumnFilter_caller<3 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<4 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<5 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<6 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<7 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<8 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<9 , T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<10, T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<11, T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<12, T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<13, T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<14, T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<15, T, D, BrdColReplicate>, 
+                    linearColumnFilter_caller<16, T, D, BrdColReplicate>
+                },
+                {
+                    0, 
+                    linearColumnFilter_caller<1 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<2 , T, D, BrdColConstant>,
+                    linearColumnFilter_caller<3 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<4 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<5 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<6 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<7 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<8 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<9 , T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<10, T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<11, T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<12, T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<13, T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<14, T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<15, T, D, BrdColConstant>, 
+                    linearColumnFilter_caller<16, T, D, BrdColConstant> 
+                },
+                {
+                    0, 
+                    linearColumnFilter_caller<1 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<2 , T, D, BrdColReflect>,
+                    linearColumnFilter_caller<3 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<4 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<5 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<6 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<7 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<8 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<9 , T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<10, T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<11, T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<12, T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<13, T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<14, T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<15, T, D, BrdColReflect>, 
+                    linearColumnFilter_caller<16, T, D, BrdColReflect>
+                },
+                {
+                    0, 
+                    linearColumnFilter_caller<1 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<2 , T, D, BrdColWrap>,
+                    linearColumnFilter_caller<3 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<4 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<5 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<6 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<7 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<8 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<9 , T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<10, T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<11, T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<12, T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<13, T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<14, T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<15, T, D, BrdColWrap>, 
+                    linearColumnFilter_caller<16, T, D, BrdColWrap>,
+                }
+            };
+            
+            loadKernel(kernel, ksize);
+
+            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+        }
+
+        template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    } // namespace column_filter
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@@ -43,87 +43,85 @@
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace imgproc {
-
-template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
+namespace cv { namespace gpu { namespace device 
 {
-    const int x = blockDim.x * blockIdx.x + threadIdx.x;
-    const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-    if (x < dst.cols && y < dst.rows)
-        dst.ptr(y)[x] = src(y - top, x - left);
-}
-
-template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
-{
-    static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, 
-        const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
-    {        
-        dim3 block(32, 8);
-        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-        B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
-        BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
-
-        copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-};
-
-template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, 
-    const T* borderValue, cudaStream_t stream)
-{
-    typedef typename TypeVec<T, cn>::vec_type vec_type;
-
-    typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
-
-    static const caller_t callers[5] = 
+    namespace imgproc 
    {
-        CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call, 
-        CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call, 
-        CopyMakeBorderDispatcher<BrdConstant, vec_type>::call, 
-        CopyMakeBorderDispatcher<BrdReflect, vec_type>::call, 
-        CopyMakeBorderDispatcher<BrdWrap, vec_type>::call 
-    };
+        template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left) // kernel: each thread writes at most one pixel of dst
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x; // destination column of this thread
+            const int y = blockDim.y * blockIdx.y + threadIdx.y; // destination row of this thread

-    callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
-}
+            if (x < dst.cols && y < dst.rows) // threads that fall outside dst do nothing
+                dst.ptr(y)[x] = src(y - top, x - left); // shift by the top/left offsets; src(row, col) must accept coordinates outside the source (the launcher passes a BorderReader)
+        }

-template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+        template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher // host-side launcher of copyMakeBorder for border policy B and pixel type T
+        {
+            static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left, 
+                const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream) // borderValue: per-channel values, packed into a T by VecTraits<T>::make
+            {        
+                dim3 block(32, 8); // 32 x 8 threads per block
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // enough blocks to cover every dst pixel

-//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+                B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue)); // border policy built from the source size and the packed border value
+                BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd); // source wrapped together with the border policy; this is what the kernel reads through

-template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+                copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left); // no dynamic shared memory; queued on the caller's stream
+                cudaSafeCall( cudaGetLastError() ); // report launch errors

-template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+                if (stream == 0) // default stream: block until the kernel has finished
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };

-//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, 
+            const T* borderValue, cudaStream_t stream) // host entry point: T is the channel element type, cn the channel count; dispatches on borderMode
+        {
+            typedef typename TypeVec<T, cn>::vec_type vec_type; // pixel type made of cn elements of T

-template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+            typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream); // signature shared by every dispatcher entry

-} // namespace imgproc
+            static const caller_t callers[5] = 
+            {
+                CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call, 
+                CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call, 
+                CopyMakeBorderDispatcher<BrdConstant, vec_type>::call, 
+                CopyMakeBorderDispatcher<BrdReflect, vec_type>::call, 
+                CopyMakeBorderDispatcher<BrdWrap, vec_type>::call 
+            }; // indexed by borderMode: 0 = Reflect101, 1 = Replicate, 2 = Constant, 3 = Reflect, 4 = Wrap

-END_OPENCV_DEVICE_NAMESPACE
+            callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream); // borderMode is not range-checked here; it must be in [0, 5)
+        }
+
+        template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream); // explicit instantiations: <channel element type, channel count>
+        //template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
+
+        //template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
+
+        template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
+
+        template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
+
+        //template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
+
+        template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+        //template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -45,177 +45,175 @@
 #include "opencv2/gpu/device/utility.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
+namespace cv { namespace gpu { namespace device 
+{
+    #define UINT_BITS 32U // width of a uint counter word; used to position and mask the per-thread tag bits

-#define UINT_BITS 32U
+    //Warps per threadblock; each warp accumulates into its own subhistogram
+    #define WARP_COUNT 6

-//Warps == subhistograms per threadblock
-#define WARP_COUNT 6
+    //Threads per histogram256 threadblock
+    #define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
+    #define HISTOGRAM256_BIN_COUNT 256 // bins per histogram (the kernels index bins with 8-bit values)

-//Threadblock size
-#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
-#define HISTOGRAM256_BIN_COUNT 256
+    //Shared-memory counters (uints) per threadblock: one full subhistogram per warp
+    #define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)

-//Shared memory per threadblock
-#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
+    #define PARTIAL_HISTOGRAM256_COUNT 240 // number of histogram256 blocks launched, and of partial histograms summed by mergeHistogram256

-#define PARTIAL_HISTOGRAM256_COUNT 240
+    #define MERGE_THREADBLOCK_SIZE 256 // threads per mergeHistogram256 threadblock

-#define MERGE_THREADBLOCK_SIZE 256
+    #define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120) // selects the atomicAdd version of addByte; otherwise the tagged-write fallback is compiled

-#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
-
-namespace hist {
-
-#if (!USE_SMEM_ATOMICS)
-
-    #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
-
-    __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
+    namespace hist 
    {
-        uint count;
-        do
+        #if (!USE_SMEM_ATOMICS) // fallback compiled when the shared-memory atomicAdd path is not selected
+
+            #define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U ) // low bits hold the count; the top OPENCV_GPU_LOG_WARP_SIZE bits hold the writer's tag
+
+            __forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag) // increments bin `data` using a tagged write and read-back instead of an atomic
+            {
+                uint count;
+                do
+                {
+                    count = s_WarpHist[data] & TAG_MASK; // current count with any previous tag stripped
+                    count = threadTag | (count + 1); // incremented count stamped with this thread's tag
+                    s_WarpHist[data] = count;
+                } while (s_WarpHist[data] != count); // retry until the value read back is the one this thread wrote
+            }
+
+        #else
+
+            #define TAG_MASK 0xFFFFFFFFU // no tag bits on this path: the whole word is the count
+
+            __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag) // threadTag is unused on this path
+            {
+                atomicAdd(s_WarpHist + data, 1); // atomically increments bin `data`
+            }
+
+        #endif
+
+        __forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols) // adds each of the four bytes packed in `data` to the subhistogram, skipping bytes at or beyond column `cols`
        {
-            count = s_WarpHist[data] & TAG_MASK;
-            count = threadTag | (count + 1);
-            s_WarpHist[data] = count;
-        } while (s_WarpHist[data] != count);
-    }
+            uint x = pos_x << 2; // byte column of the word's lowest byte (pos_x counts uint words, 4 bytes each)

-#else
+            if (x + 0 < cols) addByte(s_WarpHist, (data >>  0) & 0xFFU, tag); // lowest byte first
+            if (x + 1 < cols) addByte(s_WarpHist, (data >>  8) & 0xFFU, tag);
+            if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
+            if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag); // highest byte last
+        }

-    #define TAG_MASK 0xFFFFFFFFU
+        __global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols) // kernel: each block writes one HISTOGRAM256_BIN_COUNT-bin partial histogram; dataCount = number of uint words to visit, cols = row width in bytes
+        {
+            //Shared counters for the whole block: one subhistogram per warp
+            __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
+            uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT; // slice belonging to this thread's warp

-    __forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
-    {
-        atomicAdd(s_WarpHist + data, 1);
-    }
+            //Zero every shared counter of this threadblock before accumulating
+            #pragma unroll
+            for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
+               s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;

-#endif
+            //Tag passed to addByte: threadIdx.x shifted into the bits above the count field
+            const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);

-__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
-{
-    uint x = pos_x << 2;
+            __syncthreads(); // all counters must be zeroed before any thread starts counting
+            const uint colsui = d_Data.step / sizeof(uint); // row pitch expressed in uint words
+            for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x) // grid-stride loop over all words
+            {
+                uint pos_y = pos / colsui; // row containing this word
+                uint pos_x = pos % colsui; // word index inside that row
+                uint data = d_Data.ptr(pos_y)[pos_x];
+                addWord(s_WarpHist, data, tag, pos_x, cols); // counts only the bytes that lie before column `cols`
+            }

-    if (x + 0 < cols) addByte(s_WarpHist, (data >>  0) & 0xFFU, tag);
-    if (x + 1 < cols) addByte(s_WarpHist, (data >>  8) & 0xFFU, tag);
-    if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
-    if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
-}
+            //Sum the per-warp subhistograms into this block's partial histogram in global memory
+            __syncthreads();
+            for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
+            {
+                uint sum = 0;

-__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
-{
-    //Per-warp subhistogram storage
-    __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
-    uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
+                for (uint i = 0; i < WARP_COUNT; i++)
+                    sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK; // TAG_MASK strips the tag bits (all ones, hence a no-op, on the atomic path)

-    //Clear shared memory storage for current threadblock before processing
-    #pragma unroll
-    for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
-       s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
+                d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum; // partial histograms are stored back to back, one per block
+            }
+        }

-    //Cycle through the entire data set, update subhistograms for each warp
-    const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
+        ////////////////////////////////////////////////////////////////////////////////
+        // Merge histogram256() output
+        // Run one threadblock per bin; each threadblock adds up the same bin counter
+        // from every partial histogram. Reads are uncoalesced, but mergeHistogram256
+        // takes only a fraction of total processing time
+        ////////////////////////////////////////////////////////////////////////////////

-    __syncthreads();
-    const uint colsui = d_Data.step / sizeof(uint);
-    for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
-    {
-        uint pos_y = pos / colsui;
-        uint pos_x = pos % colsui;
-        uint data = d_Data.ptr(pos_y)[pos_x];
-        addWord(s_WarpHist, data, tag, pos_x, cols);
-    }
+        __global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram) // kernel: block b sums bin b over all partial histograms and stores it in d_Histogram[b]
+        {
+            uint sum = 0; // this thread's share of the bin total

-    //Merge per-warp histograms into per-block and write to global memory
-    __syncthreads();
-    for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
-    {
-        uint sum = 0;
+            #pragma unroll
+            for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE) // i selects a partial histogram; threads with threadIdx.x >= PARTIAL_HISTOGRAM256_COUNT read nothing
+                sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT]; // bin blockIdx.x of partial histogram i

-        for (uint i = 0; i < WARP_COUNT; i++)
-            sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
+            __shared__ uint data[MERGE_THREADBLOCK_SIZE]; // one slot per thread for the block-wide reduction
+            data[threadIdx.x] = sum;

-        d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
-    }
-}
+            for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1) // halving reduction in shared memory
+            {
+                __syncthreads(); // makes the previous round's writes visible before they are read
+                if(threadIdx.x < stride)
+                    data[threadIdx.x] += data[threadIdx.x + stride];
+            }

-////////////////////////////////////////////////////////////////////////////////
-// Merge histogram256() output
-// Run one threadblock per bin; each threadblock adds up the same bin counter
-// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
-// takes only a fraction of total processing time
-////////////////////////////////////////////////////////////////////////////////
+            if(threadIdx.x == 0) // thread 0 performed the last addition into data[0] itself
+                d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]); // uint total converted to int with saturation
+        }

-__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
-{
-    uint sum = 0;
+        void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream) // host launcher: partial histograms of src go to buf, the merged histogram to hist
+        {
+            histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>( // each of the PARTIAL_HISTOGRAM256_COUNT blocks writes HISTOGRAM256_BIN_COUNT uints into buf
+                DevMem2D_<uint>(src), // source bytes reinterpreted as uint words
+                buf, 
+                static_cast<uint>(src.rows * src.step / sizeof(uint)), // dataCount: uint words over the full row pitch of every row
+                src.cols); // the kernel skips bytes at or beyond this column

-    #pragma unroll
-    for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
-        sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
+            cudaSafeCall( cudaGetLastError() ); // report launch errors of histogram256

-    __shared__ uint data[MERGE_THREADBLOCK_SIZE];
-    data[threadIdx.x] = sum;
+            mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist); // one block per bin; writes HISTOGRAM256_BIN_COUNT ints into hist

-    for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
-    {
-        __syncthreads();
-        if(threadIdx.x < stride)
-            data[threadIdx.x] += data[threadIdx.x + stride];
-    }
+            cudaSafeCall( cudaGetLastError() ); // report launch errors of mergeHistogram256

-    if(threadIdx.x == 0)
-        d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
-}
+            if (stream == 0) // default stream: block until both kernels have finished
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }

-void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
-{
-    histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
-        DevMem2D_<uint>(src),
-        buf, 
-        static_cast<uint>(src.rows * src.step / sizeof(uint)),
-        src.cols);
+        __constant__ int c_lut[256]; // lookup table indexed by 8-bit pixel value; filled by equalizeHist_gpu (presumably a cumulative histogram - confirm against the caller)

-    cudaSafeCall( cudaGetLastError() );
+        __global__ void equalizeHist(const DevMem2Db src, PtrStepb dst) // kernel: each thread maps at most one source pixel through c_lut into dst
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x; // pixel column of this thread
+            const int y = blockIdx.y * blockDim.y + threadIdx.y; // pixel row of this thread

-    mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
+            if (x < src.cols && y < src.rows) // threads outside the image do nothing
+            {
+                const uchar val = src.ptr(y)[x];
+                const int lut = c_lut[val];
+                dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut); // scale the table entry by 255 / pixel count and round to nearest
+            }
+        }

-    cudaSafeCall( cudaGetLastError() );
+        void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream) // host launcher: copies lut (256 ints in device memory) into c_lut, then runs equalizeHist
+        {
+            dim3 block(16, 16); // 16 x 16 threads per block
+            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); // enough blocks to cover every source pixel

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) ); // device-to-device copy; this call takes no stream argument. NOTE(review): c_lut is one shared symbol - confirm calls on different streams cannot overlap

-__constant__ int c_lut[256];
+            equalizeHist<<<grid, block, 0, stream>>>(src, dst); // queued on the caller's stream
+            cudaSafeCall( cudaGetLastError() ); // report launch errors

-__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (x < src.cols && y < src.rows)
-    {
-        const uchar val = src.ptr(y)[x];
-        const int lut = c_lut[val];
-        dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
-    }
-}
-
-void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
-{
-    dim3 block(16, 16);
-    dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-
-    cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
-
-    equalizeHist<<<grid, block, 0, stream>>>(src, dst);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-} // namespace hist
-
-END_OPENCV_DEVICE_NAMESPACE
+            if (stream == 0) // default stream: block until the kernel has finished
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    } // namespace hist
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -50,7 +50,7 @@
 #include "safe_call.hpp"

 #ifndef CV_PI
-#define CV_PI   3.1415926535897932384626433832795f
+#define CV_PI   3.1415926535897932384626433832795 // double-precision literal (no f suffix); a separate CV_PI_F guard follows below
 #endif

 #ifndef CV_PI_F
@@ -61,27 +61,21 @@
  #endif
 #endif

-#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device { 
-#define END_OPENCV_DEVICE_NAMESPACE   }}}
-#define OPENCV_DEVICE_NAMESPACE       ::cv::gpu::device
-#define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device:: 
-
 #ifdef __CUDACC__

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-typedef signed char schar;
-typedef unsigned int uint;
-
-template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
+namespace cv { namespace gpu { namespace device 
 {
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-    cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
-}
+    typedef unsigned char uchar; // short integer aliases declared inside cv::gpu::device
+    typedef unsigned short ushort;
+    typedef signed char schar;
+    typedef unsigned int uint;

-END_OPENCV_DEVICE_NAMESPACE
+    template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img) // binds img's memory to the 2D texture reference tex
+    {
+        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>(); // channel format derived from the element type T
+        cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) ); // first argument 0: no offset is requested back; width, height and pitch come from img.cols, img.rows and img.step
+    }
+}}}

 #endif

@@ -102,87 +96,6 @@ namespace cv { namespace gpu

    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

-    /*template<class T> static inline void uploadConstant(const char* name, const T& value) 
-    { 
-        cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); 
-    }
-
-    template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream) 
-    {
-        cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) ); 
-    }   */     
-
-    //template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)
-    //{            
-    //    //!!!! const_cast is disabled!
-    //    //!!!! Please use constructor of 'class texture'  instead.
-    //
-    //    //textureReference* tex; 
-    //    //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) ); 
-    //    //tex->normalized = normalized;
-    //    //tex->filterMode = filterMode;
-    //    //tex->addressMode[0] = addrMode;
-    //    //tex->addressMode[1] = addrMode;
-    //    
-    //    const textureReference* tex; 
-    //    cudaSafeCall( cudaGetTextureReference(&tex, name) ); 
-    //
-    //    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-    //    cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
-    //}
-
-    //static inline void unbindTexture(const char *name)
-    //{
-    //    const textureReference* tex; 
-    //    cudaSafeCall( cudaGetTextureReference(&tex, name) ); 
-    //    cudaSafeCall( cudaUnbindTexture(tex) );
-    //}
-
-    
-
-    //class TextureBinder
-    //{
-    //public:
-    //    TextureBinder() : tex_(0) {}
-    //    template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
-    //    {
-    //        bind(tex, img);
-    //    }
-    //    template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
-    //    {
-    //        bind(tex_name, img);
-    //    }
-    //    ~TextureBinder() { unbind(); }
-    //
-    //    template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
-    //    {
-    //        unbind();
-    //
-    //        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-    //        cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
-    //
-    //        tex_ = tex;
-    //    }
-    //    template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
-    //    {
-    //        const textureReference* tex; 
-    //        cudaSafeCall( cudaGetTextureReference(&tex, tex_name) ); 
-    //        bind(tex, img);
-    //    }
-    //
-    //    void unbind()
-    //    {
-    //        if (tex_)
-    //        {
-    //            cudaUnbindTexture(tex_);
-    //            tex_ = 0;
-    //        }
-    //    }
-    //
-    //private:
-    //    const textureReference* tex_;
-    //};
-
    class NppStreamHandler
    {
    public:
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -42,174 +42,172 @@

 #include "internal_shared.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace mathfunc {
-
-//////////////////////////////////////////////////////////////////////////////////////
-// Cart <-> Polar
-
-struct Nothing
+namespace cv { namespace gpu { namespace device 
 {
-    static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
+    namespace mathfunc 
    {
-    }
-};
-struct Magnitude
-{
-    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
-    {
-        dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
-    }
-};
-struct MagnitudeSqr
-{
-    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
-    {
-        dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
-    }
-};
-struct Atan2
-{
-    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
-    {
-        float angle = ::atan2f(y_data, x_data);
-        angle += (angle < 0) * 2.0 * CV_PI;
-        dst[y * dst_step + x] = scale * angle;
-    }
-};
-template <typename Mag, typename Angle>
-__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, 
-                            float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
-{
-	const int x = blockDim.x * blockIdx.x + threadIdx.x;
-	const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        //////////////////////////////////////////////////////////////////////////////////////
+        // Cart <-> Polar

-    if (x < width && y < height)
-    {
-        float x_data = xptr[y * x_step + x];
-        float y_data = yptr[y * y_step + x];
-
-        Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
-        Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
-    }
-}
-
-struct NonEmptyMag
-{
-    static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
-    {
-        return mag[y * mag_step + x];
-    }
-};
-struct EmptyMag
-{
-    static __device__ __forceinline__ float get(const float*, size_t, int, int)
-    {
-        return 1.0f;
-    }
-};
-template <typename Mag>
-__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
-    float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
-{
-	const int x = blockDim.x * blockIdx.x + threadIdx.x;
-	const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-    if (x < width && y < height)
-    {
-        float mag_data = Mag::get(mag, mag_step, x, y);
-        float angle_data = angle[y * angle_step + x];
-        float sin_a, cos_a;
-
-        ::sincosf(scale * angle_data, &sin_a, &cos_a);
-
-        xptr[y * x_step + x] = mag_data * cos_a;
-        yptr[y * y_step + x] = mag_data * sin_a;
-    }
-}
-
-template <typename Mag, typename Angle>
-void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(x.cols, threads.x);
-    grid.y = divUp(x.rows, threads.y);
-    
-    const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
-
-    cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
-        x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), 
-        mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
-{
-    typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
-    static const caller_t callers[2][2][2] = 
-    {
+        struct Nothing
        {
+            static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
            {
-                cartToPolar_caller<Magnitude, Atan2>,
-                cartToPolar_caller<Magnitude, Nothing>
-            },
-            {
-                cartToPolar_caller<MagnitudeSqr, Atan2>,
-                cartToPolar_caller<MagnitudeSqr, Nothing>,
            }
-        },
+        };
+        struct Magnitude
        {
+            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
            {
-                cartToPolar_caller<Nothing, Atan2>,
-                cartToPolar_caller<Nothing, Nothing>
-            },
+                dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
+            }
+        };
+        struct MagnitudeSqr
+        {
+            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
            {
-                cartToPolar_caller<Nothing, Atan2>,
-                cartToPolar_caller<Nothing, Nothing>,
+                dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
+            }
+        };
+        struct Atan2 // Angle policy for cartToPolar: writes scale * atan2(y, x), with the angle wrapped into [0, 2*pi) before scaling.
+        {
+            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale) // dst_step is used as an element stride (dst is indexed as float*)
+            {
+                float angle = ::atan2f(y_data, x_data); // result lies in [-pi, pi]
+                angle += (angle < 0) * 2.0f * (float)CV_PI; // branchless wrap of negative angles; kept in single precision so no double arithmetic is generated on the device
+                dst[y * dst_step + x] = scale * angle;
+            }
+        };
+        template <typename Mag, typename Angle> // Mag / Angle: policy types whose static calc() writes (or deliberately skips) the corresponding output
+        __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step, // Per-pixel Cartesian -> polar conversion; one thread per (x, y) element.
+                                    float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height) // all *_step values are used as element strides, not byte strides
+        {
+	        const int x = blockDim.x * blockIdx.x + threadIdx.x; // column handled by this thread
+	        const int y = blockDim.y * blockIdx.y + threadIdx.y; // row handled by this thread
+
+            if (x < width && y < height) // threads in the grid tail outside the image do nothing
+            {
+                float x_data = xptr[y * x_step + x];
+                float y_data = yptr[y * y_step + x];
+
+                Mag::calc(x, y, x_data, y_data, mag, mag_step, scale); // the same scale is forwarded to both policies; each decides whether to use it
+                Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
             }
         }
-    };

-    callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
-}
+        struct NonEmptyMag // Magnitude policy for polarToCart: reads the magnitude from the supplied buffer.
+        {
+            static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y) // mag_step is used as an element stride
+            {
+                return mag[y * mag_step + x]; // element at row y, column x
+            }
+        };
+        struct EmptyMag // Magnitude policy for polarToCart when no magnitude buffer is given: every element has magnitude 1.
+        {
+            static __device__ __forceinline__ float get(const float*, size_t, int, int) // all arguments are ignored; the signature only mirrors NonEmptyMag::get
+            {
+                return 1.0f; // unit magnitude, so the outputs become cos / sin of the scaled angle
+            }
+        };
+        template <typename Mag>
+        __global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
+            float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
+        {
+	        const int x = blockDim.x * blockIdx.x + threadIdx.x;
+	        const int y = blockDim.y * blockIdx.y + threadIdx.y;

-template <typename Mag>
-void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
+            if (x < width && y < height)
+            {
+                float mag_data = Mag::get(mag, mag_step, x, y);
+                float angle_data = angle[y * angle_step + x];
+                float sin_a, cos_a;

-    grid.x = divUp(mag.cols, threads.x);
-    grid.y = divUp(mag.rows, threads.y);
-    
-    const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
+                ::sincosf(scale * angle_data, &sin_a, &cos_a);

-    polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), 
-        angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
-    cudaSafeCall( cudaGetLastError() );
+                xptr[y * x_step + x] = mag_data * cos_a;
+                yptr[y * y_step + x] = mag_data * sin_a;
+            }
+        }

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+        template <typename Mag, typename Angle>
+        void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
+        {
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);

-void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
-{
-    typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
-    static const caller_t callers[2] = 
-    {
-        polarToCart_caller<NonEmptyMag>,
-        polarToCart_caller<EmptyMag>
-    };
+            grid.x = divUp(x.cols, threads.x);
+            grid.y = divUp(x.rows, threads.y);
+            
+            const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;

-    callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
-}
+            cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
+                x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), 
+                mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
+            cudaSafeCall( cudaGetLastError() );

-} // namespace mathfunc
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }

-END_OPENCV_DEVICE_NAMESPACE
+        void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream) // Host entry point: selects the cartToPolar_caller instantiation matching the requested outputs and invokes it.
+        {
+            typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
+            static const caller_t callers[2][2][2] = // indexed as [mag.data == 0][magSqr][angle.data == 0]
+            {
+                { // mag buffer present
+                    { // magSqr == false: plain magnitude
+                        cartToPolar_caller<Magnitude, Atan2>,
+                        cartToPolar_caller<Magnitude, Nothing>
+                    },
+                    { // magSqr == true: squared magnitude
+                        cartToPolar_caller<MagnitudeSqr, Atan2>,
+                        cartToPolar_caller<MagnitudeSqr, Nothing>,
+                    }
+                },
+                { // mag.data == 0: no magnitude is written, so both magSqr rows are identical
+                    {
+                        cartToPolar_caller<Nothing, Atan2>,
+                        cartToPolar_caller<Nothing, Nothing>
+                    },
+                    {
+                        cartToPolar_caller<Nothing, Atan2>,
+                        cartToPolar_caller<Nothing, Nothing>,
+                    }
+                }
+            };
+
+            callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream); // the bool expressions convert to 0 / 1 table indices
+        }
+
+        template <typename Mag> // Mag: NonEmptyMag or EmptyMag, selecting how the kernel obtains the magnitude
+        void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream) // Launches polarToCart<Mag> on `stream` over a mag.cols x mag.rows domain.
+        {
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(mag.cols, threads.x); // NOTE(review): dimensions come from mag even on the EmptyMag path - confirm callers set mag.cols/rows when mag.data is 0
+            grid.y = divUp(mag.rows, threads.y);
+            
+            const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f; // degrees -> radians factor; the kernel multiplies the angle by it before sincosf
+
+            polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(), 
+                angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows); // steps are divided by elemSize() because the kernel indexes float pointers
+            cudaSafeCall( cudaGetLastError() ); // surfaces launch-configuration errors
+
+            if (stream == 0) // the host blocks only when the default stream is used
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream) // Host entry point: picks the magnitude policy from whether mag.data is null and forwards all arguments.
+        {
+            typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
+            static const caller_t callers[2] = // indexed by (mag.data == 0)
+            {
+                polarToCart_caller<NonEmptyMag>, // [0]: magnitude buffer supplied
+                polarToCart_caller<EmptyMag> // [1]: no magnitude buffer, unit magnitude is used
+            };
+
+            callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
+        }
+    } // namespace mathfunc
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/matrix_operations.cu
+++ b/modules/gpu/src/cuda/matrix_operations.cu
@@ -45,304 +45,303 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/functional.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-template <typename T> struct shift_and_sizeof;
-template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
-template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
-template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
-template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
-template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
-template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
-template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
-
-///////////////////////////////////////////////////////////////////////////
-////////////////////////////////// CopyTo /////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
-template<typename T>
-__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
+namespace cv { namespace gpu { namespace device 
 {
-    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+    template <typename T> struct shift_and_sizeof; // Maps an element type to log2(sizeof(T)); declared only, so a type without a specialization fails to compile.
+    template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; }; // 1-byte element
+    template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
+    template <> struct shift_and_sizeof<short> { enum { shift = 1 }; }; // 2-byte element
+    template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
+    template <> struct shift_and_sizeof<int> { enum { shift = 2 }; }; // 4-byte element
+    template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
+    template <> struct shift_and_sizeof<double> { enum { shift = 3 }; }; // 8-byte element; kernels in this file use `step >> shift` to turn a byte step into an element step

-    if ((x < cols * channels ) && (y < rows))
-        if (mask[y * step_mask + x / channels] != 0)
-        {
-            size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
-            mat_dst[idx] = mat_src[idx];
-        }
-}
+    ///////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////// CopyTo /////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////

-template<typename T>
-void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
-{
-    dim3 threadsPerBlock(16,16, 1);
-    dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
-
-    copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
-            ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall ( cudaDeviceSynchronize() );
-}
-
-void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
-{
-    typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
-
-    static CopyToFunc tab[8] =
+    template<typename T>
+    __global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
    {
-        copy_to_with_mask_run<unsigned char>,
-        copy_to_with_mask_run<signed char>,
-        copy_to_with_mask_run<unsigned short>,
-        copy_to_with_mask_run<short>,
-        copy_to_with_mask_run<int>,
-        copy_to_with_mask_run<float>,
-        copy_to_with_mask_run<double>,
-        0
-    };
+        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t y = blockIdx.y * blockDim.y + threadIdx.y;

-    CopyToFunc func = tab[depth];
-
-    if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
-
-    func(mat_src, mat_dst, mask, channels, stream);
-}
-
-///////////////////////////////////////////////////////////////////////////
-////////////////////////////////// SetTo //////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
-__constant__ uchar scalar_8u[4];
-__constant__ schar scalar_8s[4];
-__constant__ ushort scalar_16u[4];
-__constant__ short scalar_16s[4];
-__constant__ int scalar_32s[4];
-__constant__ float scalar_32f[4]; 
-__constant__ double scalar_64f[4];
-
-template <typename T> __device__ __forceinline__ T readScalar(int i);
-template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
-template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
-template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
-template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
-template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
-template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
-template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}
-
-void writeScalar(const uchar* vals)
-{
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
-}
-void writeScalar(const schar* vals)
-{
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
-}
-void writeScalar(const ushort* vals)
-{
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
-}
-void writeScalar(const short* vals)
-{
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
-}
-void writeScalar(const int* vals)
-{
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
-}
-void writeScalar(const float* vals)
-{
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
-}
-void writeScalar(const double* vals)
-{
-    cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
-}
-
-template<typename T>
-__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
-{
-    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if ((x < cols * channels ) && (y < rows))
-    {
-        size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
-        mat[idx] = readScalar<T>(x % channels);
+        if ((x < cols * channels ) && (y < rows))
+            if (mask[y * step_mask + x / channels] != 0)
+            {
+                size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
+                mat_dst[idx] = mat_src[idx];
+            }
    }
-}

-template<typename T>
-__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
-{
-    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
-    size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+    template<typename T>
+    void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
+    {
+        dim3 threadsPerBlock(16,16, 1);
+        dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);

-    if ((x < cols * channels ) && (y < rows))
-        if (mask[y * step_mask + x / channels] != 0)
+        copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
+                ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall ( cudaDeviceSynchronize() );
+    }
+
+    void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream) // Host dispatcher: runs the copy_to_with_mask_run instantiation selected by depth, or reports an error if there is none.
+    {
+        typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
+
+        static const CopyToFunc tab[8] = // indexed by depth; never modified, hence const like the other dispatch tables
+        {
+            copy_to_with_mask_run<unsigned char>,
+            copy_to_with_mask_run<signed char>,
+            copy_to_with_mask_run<unsigned short>,
+            copy_to_with_mask_run<short>,
+            copy_to_with_mask_run<int>,
+            copy_to_with_mask_run<float>,
+            copy_to_with_mask_run<double>,
+            0 // last slot has no implementation
+        };
+
+        const CopyToFunc func = (depth >= 0 && depth < 8) ? tab[depth] : 0; // an out-of-range depth is treated as unsupported instead of indexing past the table
+
+        if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
+
+        func(mat_src, mat_dst, mask, channels, stream);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    ////////////////////////////////// SetTo //////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////
+
+    __constant__ uchar scalar_8u[4]; // One 4-element constant-memory buffer per element type: filled by writeScalar(), read by readScalar<T>().
+    __constant__ schar scalar_8s[4];
+    __constant__ ushort scalar_16u[4];
+    __constant__ short scalar_16s[4];
+    __constant__ int scalar_32s[4];
+    __constant__ float scalar_32f[4]; 
+    __constant__ double scalar_64f[4];
+
+    template <typename T> __device__ __forceinline__ T readScalar(int i); // Returns element i of the constant buffer for type T; only the specializations below are defined.
+    template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
+    template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
+    template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
+    template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
+    template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
+    template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
+    template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];} // i is not range-checked; the buffers hold 4 elements
+
+    void writeScalar(const uchar* vals) // Overload set: each copies 4 elements from vals into the constant buffer of the matching type.
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) ); // always copies 4 elements, so vals must point to at least 4 values
+    }
+    void writeScalar(const schar* vals) // signed 8-bit variant
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
+    }
+    void writeScalar(const ushort* vals) // unsigned 16-bit variant
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
+    }
+    void writeScalar(const short* vals) // signed 16-bit variant
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
+    }
+    void writeScalar(const int* vals) // 32-bit integer variant
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
+    }
+    void writeScalar(const float* vals) // 32-bit float variant
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
+    }
+    void writeScalar(const double* vals) // 64-bit float variant
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
+    }
+
+    template<typename T>
+    __global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
+    {
+        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if ((x < cols * channels ) && (y < rows))
        {
            size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
            mat[idx] = readScalar<T>(x % channels);
        }
-}
-template <typename T>
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
-{
-    writeScalar(scalar);
-
-    dim3 threadsPerBlock(32, 8, 1);
-    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-
-    set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall ( cudaDeviceSynchronize() );
-}
-
-template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
-template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
-template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
-template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
-template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
-template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
-template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
-
-template <typename T>
-void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
-{
-    writeScalar(scalar);
-
-    dim3 threadsPerBlock(32, 8, 1);
-    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-
-    set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall ( cudaDeviceSynchronize() );
-}
-
-template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
-template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
-template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
-template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
-template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
-template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
-template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
-
-///////////////////////////////////////////////////////////////////////////
-//////////////////////////////// ConvertTo ////////////////////////////////
-///////////////////////////////////////////////////////////////////////////
-
-template <typename T, typename D> struct Convertor : unary_function<T, D>
-{
-    Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
-
-    __device__ __forceinline__ D operator()(const T& src) const
-    {
-        return saturate_cast<D>(alpha * src + beta);
    }

-    const double alpha, beta;
-};
+    template<typename T>
+    __global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
+    {
+        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
+        size_t y = blockIdx.y * blockDim.y + threadIdx.y;

-namespace detail
-{
-    template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
+        if ((x < cols * channels ) && (y < rows))
+            if (mask[y * step_mask + x / channels] != 0)
+            {
+                size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
+                mat[idx] = readScalar<T>(x % channels);
+            }
+    }
+    template <typename T>
+    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
    {
-    };
-    template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
+        writeScalar(scalar);
+
+        dim3 threadsPerBlock(32, 8, 1);
+        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+
+        set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall ( cudaDeviceSynchronize() );
+    }
+
+    template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
+    template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
+
+    template <typename T>
+    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
    {
-        enum { smart_shift = 8 };
-    };
-    template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
+        writeScalar(scalar);
+
+        dim3 threadsPerBlock(32, 8, 1);
+        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
+
+        set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall ( cudaDeviceSynchronize() );
+    }
+
+    template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
+    template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
+
+    ///////////////////////////////////////////////////////////////////////////
+    //////////////////////////////// ConvertTo ////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////
+
+    template <typename T, typename D> struct Convertor : unary_function<T, D> // Unary functor computing saturate_cast<D>(alpha * src + beta) for a source element of type T.
    {
-        enum { smart_shift = 4 };
-    };
-    template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {} // no __device__ qualifier: the functor is constructed on the host
+
+        __device__ __forceinline__ D operator()(const T& src) const // device-side call operator
+        {
+            return saturate_cast<D>(alpha * src + beta); // evaluated in double because alpha and beta are double
+        }
+
+        const double alpha, beta; // scale and offset, fixed at construction
    };

-    template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
+    namespace detail // Launch-tuning traits for conversion functors, selected by the byte sizes of the source and destination element types.
    {
-        enum { smart_shift = 4 };
-    };
-    template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
-    {
-        enum { smart_shift = 2 };
-    };
+        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F> // primary template: any size pair not specialized below keeps the defaults
+        {
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F> // 1-byte source, 1-byte destination
+        {
+            enum { smart_shift = 8 }; // NOTE(review): smart_shift / smart_block_dim_y are consumed by the transform code outside this view; presumably elements per thread and block height -- confirm
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F> // 1-byte source, 2-byte destination
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F> // 1-byte source, 4-byte destination
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };

-    template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 2 };
-    };
+        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F> // 2-byte source, 2-byte destination
+        {
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F> // 2-byte source, 4-byte destination
+        {
+            enum { smart_shift = 2 };
+        };

-    template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
+        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F> // 4-byte source, 2-byte destination
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 4 };
+        };
+        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F> // 4-byte source, 4-byte destination
+        {
+            enum { smart_block_dim_y = 8 };
+            enum { smart_shift = 2 };
+        };
+
+        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F> // picks the dispatcher from sizeof(argument_type) and sizeof(result_type) of the functor
+        {
+        };
+    }
+
+    template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> > // specialization: Convertor functors take their transform traits from detail::ConvertTraits
    {
    };
-}
-
-template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
-{
-};
-    
-template<typename T, typename D>
-void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
-{
-    cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
-    cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-    Convertor<T, D> op(alpha, beta);
-    OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
-}
-
-void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, 
-    cudaStream_t stream = 0)
-{
-    typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, 
-        cudaStream_t stream);
-
-    static const caller_t tab[8][8] =
+        
+    template<typename T, typename D> // Converts `src` (elements of T) into `dst` (elements of D) as saturate_cast<D>(alpha * x + beta), on `stream`.
+    void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
    {
-        {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
-        cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
+        cudaSafeCall( cudaSetDoubleForDevice(&alpha) ); // CUDA runtime helper: adjusts the double in place for devices without double-precision support
+        cudaSafeCall( cudaSetDoubleForDevice(&beta) );
+        Convertor<T, D> op(alpha, beta); // alpha and beta are by-value parameters, so the adjusted copies are what the functor captures
+        ::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream); // fully qualified call to the device-layer transform; the byte matrices are reinterpreted as typed views
+    }

-        {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
-        cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
+    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, // Dispatches to cvt_<S, D> chosen by (sdepth, ddepth); reports an error for unsupported pairs.
+        cudaStream_t stream = 0)
+    {
+        typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, 
+            cudaStream_t stream);

-        {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
-        cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
+        static const caller_t tab[8][8] = // rows: source type, columns: destination type, both ordered uchar, schar, ushort, short, int, float, double, (unsupported)
+        {
+            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
+            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},

-        {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
-        cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
+            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
+            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},

-        {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
-        cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
+            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
+            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},

-        {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
-        cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
+            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
+            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},

-        {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
-        cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
+            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
+            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},

-        {0,0,0,0,0,0,0,0}
-    };
+            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
+            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},

-    caller_t func = tab[sdepth][ddepth];
-    if (!func)
-        cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
+            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
+            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},

-    func(src, dst, alpha, beta, stream);
-}
+            {0,0,0,0,0,0,0,0} // eighth depth code has no conversions
+        };

-END_OPENCV_DEVICE_NAMESPACE
+        caller_t func = tab[sdepth][ddepth]; // NOTE(review): sdepth / ddepth index the table unchecked; assumes callers pass values in [0, 7] -- confirm
+        if (!func) // null entry: this source/destination pair is not supported
+            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
+
+        func(src, dst, alpha, beta, stream);
+    }
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
--- a/modules/gpu/src/cuda/pyr_down.cu
+++ b/modules/gpu/src/cuda/pyr_down.cu
@@ -46,142 +46,140 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace imgproc {
-
-template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
+namespace cv { namespace gpu { namespace device 
 {
-    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
-
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y;
-
-    __shared__ value_type smem[256 + 4];
-
-    value_type sum;
-    
-    const int src_y = 2*y;
-
-    sum = VecTraits<value_type>::all(0);
-    
-    sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
-    sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);
-    sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);
-    sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);
-    sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
-
-    smem[2 + threadIdx.x] = sum;
-
-    if (threadIdx.x < 2)
+    namespace imgproc 
    {
-        const int left_x = x - 2 + threadIdx.x;
+        template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols) // Downsamples by two with the 5-tap kernel (1, 4, 6, 4, 1)/16, vertically then horizontally; written for 256-thread blocks, one destination row (blockIdx.y) per block row. `b` resolves out-of-range source reads.
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; // float accumulator with the same channel count as T

-        sum = VecTraits<value_type>::all(0);
-    
-        sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
-        sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);
-        sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);
-        sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);
-        sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
+            const int x = blockIdx.x * blockDim.x + threadIdx.x; // source column owned by this thread
+            const int y = blockIdx.y; // destination row

-        smem[threadIdx.x] = sum;
-    }
+            __shared__ value_type smem[256 + 4]; // vertically filtered row: [0..1] left halo, [2..257] the block's 256 columns, [258..259] right halo

-    if (threadIdx.x > 253)
-    {
-        const int right_x = x + threadIdx.x + 2;
+            value_type sum;
+            
+            const int src_y = 2*y; // source row under destination row y

-        sum = VecTraits<value_type>::all(0);
-    
-        sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
-        sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);
-        sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);
-        sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);
-        sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
+            sum = VecTraits<value_type>::all(0);
+            
+            sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step); // vertical pass over source rows src_y - 2 .. src_y + 2 at column x
+            sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);
+            sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);
+            sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);
+            sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);

-        smem[4 + threadIdx.x] = sum;
-    }
+            smem[2 + threadIdx.x] = sum; // slot 2 + t holds column x

-    __syncthreads();
+            if (threadIdx.x < 2) // threads 0 and 1 additionally fill the left halo
+            {
+                const int left_x = x - 2; // fix: slot t sits two slots left of slot 2 + t, i.e. column x - 2; adding threadIdx.x again made thread 1 reload the block's first column

-    if (threadIdx.x < 128)
-    {
-        const int tid2 = threadIdx.x * 2;
+                sum = VecTraits<value_type>::all(0);
+            
+                sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
+                sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);
+                sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);
+                sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);
+                sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);

-        sum = VecTraits<value_type>::all(0);
+                smem[threadIdx.x] = sum;
+            }

-        sum = sum + 0.0625f * smem[2 + tid2 - 2];
-        sum = sum + 0.25f   * smem[2 + tid2 - 1];
-        sum = sum + 0.375f  * smem[2 + tid2    ];
-        sum = sum + 0.25f   * smem[2 + tid2 + 1];
-        sum = sum + 0.0625f * smem[2 + tid2 + 2];
+            if (threadIdx.x > 253) // threads 254 and 255 additionally fill the right halo
+            {
+                const int right_x = x + 2; // fix: slot 4 + t sits two slots right of slot 2 + t, i.e. column x + 2; the extra threadIdx.x pushed the read about 254 columns too far

-        const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
+                sum = VecTraits<value_type>::all(0);
+            
+                sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
+                sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);
+                sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);
+                sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);
+                sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);

-        if (dst_x < dst_cols)
-            dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
-    }
-}
+                smem[4 + threadIdx.x] = sum;
+            }

-template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
-{
-    const dim3 block(256);
-    const dim3 grid(divUp(src.cols, block.x), dst.rows);
+            __syncthreads(); // placed outside the conditionals so every thread reaches it before smem is read

-    B<T> b(src.rows, src.cols);
+            if (threadIdx.x < 128) // horizontal pass: 128 threads each emit one output pixel
+            {
+                const int tid2 = threadIdx.x * 2; // block-relative centre column (even columns only)

-    pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
-    cudaSafeCall( cudaGetLastError() );
+                sum = VecTraits<value_type>::all(0);

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+                sum = sum + 0.0625f * smem[2 + tid2 - 2];
+                sum = sum + 0.25f   * smem[2 + tid2 - 1];
+                sum = sum + 0.375f  * smem[2 + tid2    ];
+                sum = sum + 0.25f   * smem[2 + tid2 + 1];
+                sum = sum + 0.0625f * smem[2 + tid2 + 2];

-template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
-{
-    typedef typename TypeVec<T, cn>::vec_type type;
+                const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2; // destination column for source column base + tid2

-    typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
+                if (dst_x < dst_cols) // the last block may overhang the destination width
+                    dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
+            }
+        }

-    static const caller_t callers[] = 
-    {
-        pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
-    };
+        template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream) // Launches the pyrDown kernel on `stream` with border policy B.
+        {
+            const dim3 block(256); // the kernel's shared buffer and thread-index tests are written for 256 threads
+            const dim3 grid(divUp(src.cols, block.x), dst.rows); // one block per 256 source columns, one grid row per destination row

-    callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
-}
+            B<T> b(src.rows, src.cols); // border functor built from the source dimensions

-template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
+            cudaSafeCall( cudaGetLastError() ); // surface launch errors immediately

-template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            if (stream == 0) // only the default stream is synchronized here
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }

-template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream) // Host entry point: picks the pyrDown_caller instantiation for `borderType` and runs it on cn-channel pixels of T.
+        {
+            typedef typename TypeVec<T, cn>::vec_type type; // pixel type: cn-component vector of T

-template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

-template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            static const caller_t callers[] = // indexed directly by borderType: 0 Reflect101, 1 Replicate, 2 Constant, 3 Reflect, 4 Wrap
+            {
+                pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
+            };

-template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream); // NOTE(review): borderType is not range-checked here; assumes the caller passes 0..4 -- confirm
+        }

-} // namespace imgproc
+        template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); // explicit instantiations of pyrDown_gpu: uchar, schar, ushort, short, int, float, each with 1 to 4 channels
+        template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

-END_OPENCV_DEVICE_NAMESPACE
+        template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ b/modules/gpu/src/cuda/pyr_up.cu
@@ -46,137 +46,135 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace imgproc {
-
-template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)
+namespace cv { namespace gpu { namespace device 
 {
-    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
-
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    __shared__ T smem1[10][10];
-    __shared__ value_type smem2[20][16];
-
-    value_type sum;
-
-    if (threadIdx.x < 10 && threadIdx.y < 10)
-        smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
-
-    __syncthreads();
-
-    const int tidx = threadIdx.x;
-
-    sum = VecTraits<value_type>::all(0);
-
-    sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
-    sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
-    sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[1 + threadIdx.y / 2][1 + ((tidx    ) >> 1)];
-    sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
-    sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];
-
-    smem2[2 + threadIdx.y][tidx] = sum;
-
-    if (threadIdx.y < 2)
+    namespace imgproc 
    {
-        sum = VecTraits<value_type>::all(0);
+        template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b) // Upsamples by two: conceptually inserts zeros between source pixels, filters with (1, 4, 6, 4, 1)/16 in each direction and scales by 4. Written for 16x16 blocks; `b` resolves out-of-range source reads.
+        {
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type; // float accumulator with the same channel count as T

-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx - 1) >> 1)];
-        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[0][1 + ((tidx    ) >> 1)];
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx + 1) >> 1)];
-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];
+            const int x = blockIdx.x * blockDim.x + threadIdx.x; // destination column
+            const int y = blockIdx.y * blockDim.y + threadIdx.y; // destination row

-        smem2[threadIdx.y][tidx] = sum;
-    }
+            __shared__ T smem1[10][10]; // source patch: the 8x8 pixels under this block plus a one-pixel border
+            __shared__ value_type smem2[20][16]; // horizontally filtered rows: [0..1] top halo, [2..17] block rows, [18..19] bottom halo

-    if (threadIdx.y > 13)
-    {
-        sum = VecTraits<value_type>::all(0);
+            value_type sum;

-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx - 1) >> 1)];
-        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[9][1 + ((tidx    ) >> 1)];
-        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx + 1) >> 1)];
-        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];
+            if (threadIdx.x < 10 && threadIdx.y < 10) // the first 10x10 threads load the patch through the border functor
+                smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);

-        smem2[4 + threadIdx.y][tidx] = sum;
-    }
+            __syncthreads(); // patch must be complete before any thread filters it

-    __syncthreads();
+            const int tidx = threadIdx.x;

-    sum = VecTraits<value_type>::all(0);
+            sum = VecTraits<value_type>::all(0);

-    sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
-    sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y - 1][tidx];
-    sum = sum + (tidx % 2 == 0) * 0.375f  * smem2[2 + threadIdx.y    ][tidx];
-    sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y + 1][tidx];
-    sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];
+            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)]; // horizontal pass: even output columns use taps -2, 0, +2; odd ones use -1, +1 (the other taps land on inserted zeros)
+            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
+            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[1 + threadIdx.y / 2][1 + ((tidx    ) >> 1)];
+            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
+            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];

-    if (x < dst.cols && y < dst.rows)
-        dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
-}
+            smem2[2 + threadIdx.y][tidx] = sum; // slot 2 + ty holds this block's output row ty

-template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
-{
-    const dim3 block(16, 16);
-    const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+            if (threadIdx.y < 2) // thread rows 0 and 1 additionally fill the top halo from patch row 0
+            {
+                sum = VecTraits<value_type>::all(0);

-    B<T> b(src.rows, src.cols);
+                sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
+                sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx - 1) >> 1)];
+                sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[0][1 + ((tidx    ) >> 1)];
+                sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx + 1) >> 1)];
+                sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];

-    pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
-    cudaSafeCall( cudaGetLastError() );
+                smem2[threadIdx.y][tidx] = sum;
+            }

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+            if (threadIdx.y > 13) // thread rows 14 and 15 additionally fill the bottom halo from patch row 9
+            {
+                sum = VecTraits<value_type>::all(0);

-template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
-{
-    typedef typename TypeVec<T, cn>::vec_type type;
+                sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
+                sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx - 1) >> 1)];
+                sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[9][1 + ((tidx    ) >> 1)];
+                sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx + 1) >> 1)];
+                sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];

-    typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
+                smem2[4 + threadIdx.y][tidx] = sum;
+            }

-    static const caller_t callers[] = 
-    {
-        pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
-    };
+            __syncthreads(); // all of smem2 must be written before the vertical pass reads neighbouring rows

-    callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
-}
+            sum = VecTraits<value_type>::all(0);

-template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            sum = sum + (threadIdx.y % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx]; // fix: vertical taps are selected by the parity of the output ROW (threadIdx.y), not the column; even rows use -2, 0, +2 and odd rows use -1, +1
+            sum = sum + (threadIdx.y % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y - 1][tidx];
+            sum = sum + (threadIdx.y % 2 == 0) * 0.375f  * smem2[2 + threadIdx.y    ][tidx];
+            sum = sum + (threadIdx.y % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y + 1][tidx];
+            sum = sum + (threadIdx.y % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];

-template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            if (x < dst.cols && y < dst.rows) // edge blocks may overhang the destination
+                dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum); // x4 restores the brightness lost to the inserted zeros
+        }

-template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream) // Launches the pyrUp kernel on `stream` with border policy B.
+        {
+            const dim3 block(16, 16); // the kernel's shared arrays and thread-index tests are written for 16x16 threads
+            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // one thread per destination pixel, rounded up to whole blocks

-template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            B<T> b(src.rows, src.cols); // border functor built from the source dimensions

-template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
+            cudaSafeCall( cudaGetLastError() ); // surface launch errors immediately

-template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
-template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+            if (stream == 0) // only the default stream is synchronized here
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }

-} // namespace imgproc
+        template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
+        { // Entry point: views src/dst as images of cn-channel T vectors and dispatches to pyrUp_caller by borderType.
+            typedef typename TypeVec<T, cn>::vec_type type; // element type: vector of cn values of T

-END_OPENCV_DEVICE_NAMESPACE
+            typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream); // common signature of every pyrUp_caller instantiation
+
+            static const caller_t callers[] = // indexed by borderType: 0 BrdReflect101, 1 BrdReplicate, 2 BrdConstant, 3 BrdReflect, 4 BrdWrap
+            {
+                pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
+            };
+
+            callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream); // NOTE(review): borderType is not range-checked here; presumably validated by the caller (must be 0..4) - confirm
+        }
+
+        template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream); // explicit instantiations below: T in {uchar, schar, ushort, short, int, float}, cn in 1..4
+        template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+
+        template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+        template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
@@ -47,208 +47,206 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace imgproc {
-    
-template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
+namespace cv { namespace gpu { namespace device 
 {
-    const int x = blockDim.x * blockIdx.x + threadIdx.x;
-    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    namespace imgproc 
+    {    
+        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;

-    if (x < dst.cols && y < dst.rows)
-    {
-        const float xcoo = mapx.ptr(y)[x];
-        const float ycoo = mapy.ptr(y)[x];
+            if (x < dst.cols && y < dst.rows)
+            {
+                const float xcoo = mapx.ptr(y)[x];
+                const float ycoo = mapy.ptr(y)[x];

-        dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-    }
-}
-
-template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
-{
-    static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, 
-        const float* borderValue, cudaStream_t stream, int)
-    {
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; 
-        
-        dim3 block(32, 8);
-        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-        B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-        BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-        Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-        remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
-        cudaSafeCall( cudaGetLastError() );
-    }
-};
-
-template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
-{
-    static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
-    {
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; 
-        
-        dim3 block(32, 8);
-        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-        B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-        BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-        Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-        remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-};
-
-#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
-    texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-    struct tex_remap_ ## type ## _reader \
-    { \
-        typedef type elem_type; \
-        typedef int index_type; \
-        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-        { \
-            return tex2D(tex_remap_ ## type , x, y); \
-        } \
-    }; \
-    template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
-    { \
-        static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \
-        { \
-            typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-            dim3 block(32, cc >= 20 ? 8 : 4); \
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-            bindTexture(&tex_remap_ ## type , src); \
-            tex_remap_ ## type ##_reader texSrc; \
-            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
-            BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
-            Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
-            remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-            cudaSafeCall( cudaGetLastError() ); \
-            cudaSafeCall( cudaDeviceSynchronize() ); \
-        } \
-    }; \
-    template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
-    { \
-        static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) \
-        { \
-            dim3 block(32, 8); \
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-            bindTexture(&tex_remap_ ## type , src); \
-            tex_remap_ ## type ##_reader texSrc; \
-            Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
-            remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-            cudaSafeCall( cudaGetLastError() ); \
-            cudaSafeCall( cudaDeviceSynchronize() ); \
-        } \
-    };
-    
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
-
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
-
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
-
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
-
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
-
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
-//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
-OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
-
-#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
-
-template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
-{ 
-    static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, 
-        const float* borderValue, cudaStream_t stream, int cc)
-    {
-        if (stream == 0)
-            RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);
-        else
-            RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
-    }
-};
-
-template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, 
-    int borderMode, const float* borderValue, cudaStream_t stream, int cc)
-{
-    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, 
-        const float* borderValue, cudaStream_t stream, int cc);
-
-    static const caller_t callers[3][5] = 
-    {
-        { 
-            RemapDispatcher<PointFilter, BrdReflect101, T>::call, 
-            RemapDispatcher<PointFilter, BrdReplicate, T>::call, 
-            RemapDispatcher<PointFilter, BrdConstant, T>::call, 
-            RemapDispatcher<PointFilter, BrdReflect, T>::call, 
-            RemapDispatcher<PointFilter, BrdWrap, T>::call 
-        },
-        { 
-            RemapDispatcher<LinearFilter, BrdReflect101, T>::call, 
-            RemapDispatcher<LinearFilter, BrdReplicate, T>::call, 
-            RemapDispatcher<LinearFilter, BrdConstant, T>::call, 
-            RemapDispatcher<LinearFilter, BrdReflect, T>::call, 
-            RemapDispatcher<LinearFilter, BrdWrap, T>::call 
-        },
-        { 
-            RemapDispatcher<CubicFilter, BrdReflect101, T>::call, 
-            RemapDispatcher<CubicFilter, BrdReplicate, T>::call, 
-            RemapDispatcher<CubicFilter, BrdConstant, T>::call, 
-            RemapDispatcher<CubicFilter, BrdReflect, T>::call, 
-            RemapDispatcher<CubicFilter, BrdWrap, T>::call 
+                dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
+            }
        }
-    };

-    callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
-}
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
+        { // Launches the remap kernel on a caller-supplied stream, reading src through PtrStep<T>; it does not synchronize.
+            static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, 
+                const float* borderValue, cudaStream_t stream, int)
+            { // the trailing unnamed int is unused here; it keeps the parameter list parallel to the other dispatchers
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; // float vector with the same channel count as T
+                
+                dim3 block(32, 8); // 32x8 threads per block
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // grid derived from the destination size

-template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); // border policy from the source size plus borderValue converted to work_type
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); // source reader combined with the border policy
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); // interpolation filter stacked on the bordered reader

-//template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+                remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst); // 0 bytes of dynamic shared memory; enqueued on the given stream
+                cudaSafeCall( cudaGetLastError() ); // report a failed launch; no synchronization follows
+            }
+        };

-template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
+        { // Primary (non-texture) variant: launches remap without a stream argument and waits for the device to finish.
+            static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
+            { // the trailing unnamed int is unused in this variant
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type; // float vector with the same channel count as T
+                
+                dim3 block(32, 8); // 32x8 threads per block
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // grid derived from the destination size

-template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); // border policy from the source size plus borderValue converted to work_type
+                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd); // source reader combined with the border policy
+                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc); // interpolation filter stacked on the bordered reader

-//template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+                remap<<<grid, block>>>(filter_src, mapx, mapy, dst); // launched with no explicit stream argument
+                cudaSafeCall( cudaGetLastError() ); // report a failed launch

-template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-//template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
-template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+                cudaSafeCall( cudaDeviceSynchronize() ); // block until the kernel has completed
+            }
+        };

-} // namespace imgproc
+        #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) /* per type: a 2D texture reference, a reader functor over it, and two RemapDispatcherNonStream specializations that read src through that texture */ \
+            texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); /* point filtering, clamp addressing; the leading 0 is presumably the normalized-coordinates flag - confirm */ \
+            struct tex_remap_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const /* called as (row, column) */ \
+                { \
+                    return tex2D(tex_remap_ ## type , x, y); /* tex2D takes x first, so the arguments are swapped */ \
+                } \
+            }; \
+            template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> /* texture-backed variant for any border policy B */ \
+            { \
+                static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float* borderValue, int cc) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(32, cc >= 20 ? 8 : 4); /* 32x8 threads when cc >= 20, else 32x4; cc is presumably the compute capability times 10 - confirm */ \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_remap_ ## type , src); /* NOTE(review): no matching unbind is visible in this function - confirm whether one is needed */ \
+                    tex_remap_ ## type ##_reader texSrc; \
+                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); /* texture reader combined with the border policy */ \
+                    Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
+                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst); /* launched with no explicit stream argument */ \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); /* block until the kernel has completed */ \
+                } \
+            }; \
+            template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> /* BrdReplicate variant: no BorderReader; presumably the texture's clamp addressing supplies the replicated border - confirm */ \
+            { \
+                static void call(const DevMem2D_< type >& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_< type >& dst, const float*, int) /* border value and cc are unused here */ \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_remap_ ## type , src); \
+                    tex_remap_ ## type ##_reader texSrc; \
+                    Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); /* filter reads the texture reader directly */ \
+                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+            
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar) // texture path is enabled for uchar/uchar4, ushort/ushort4, short/short4, float/float4; all other types fall back to the primary RemapDispatcherNonStream template
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)

-END_OPENCV_DEVICE_NAMESPACE
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
+
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
+
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
+        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
+        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
+
+        #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
+
+        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
+        { // Chooses between the synchronizing and the stream-based dispatcher according to the stream argument.
+            static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, 
+                const float* borderValue, cudaStream_t stream, int cc)
+            {
+                if (stream == 0) // stream 0: use the NonStream path, which synchronizes and has texture-backed specializations for some types
+                    RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);
+                else // any other stream: enqueue on it and return without synchronizing
+                    RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
+            }
+        };
+
+        template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, 
+            int borderMode, const float* borderValue, cudaStream_t stream, int cc)
+        { // Entry point: views src/dst as images of T and dispatches on (interpolation, borderMode) through a function table.
+            typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, 
+                const float* borderValue, cudaStream_t stream, int cc);
+
+            static const caller_t callers[3][5] = // rows: interpolation (0 Point, 1 Linear, 2 Cubic); columns: borderMode (0 Reflect101, 1 Replicate, 2 Constant, 3 Reflect, 4 Wrap)
+            {
+                { 
+                    RemapDispatcher<PointFilter, BrdReflect101, T>::call, 
+                    RemapDispatcher<PointFilter, BrdReplicate, T>::call, 
+                    RemapDispatcher<PointFilter, BrdConstant, T>::call, 
+                    RemapDispatcher<PointFilter, BrdReflect, T>::call, 
+                    RemapDispatcher<PointFilter, BrdWrap, T>::call 
+                },
+                { 
+                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call, 
+                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call, 
+                    RemapDispatcher<LinearFilter, BrdConstant, T>::call, 
+                    RemapDispatcher<LinearFilter, BrdReflect, T>::call, 
+                    RemapDispatcher<LinearFilter, BrdWrap, T>::call 
+                },
+                { 
+                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call, 
+                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call, 
+                    RemapDispatcher<CubicFilter, BrdConstant, T>::call, 
+                    RemapDispatcher<CubicFilter, BrdReflect, T>::call, 
+                    RemapDispatcher<CubicFilter, BrdWrap, T>::call 
+                }
+            };
+
+            callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc); // NOTE(review): neither index is range-checked here; presumably validated by the caller (0..2 and 0..4) - confirm
+        }
+
+        template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc); // explicit instantiations: 1-, 3- and 4-channel uchar/ushort/short/float; the commented-out lines below are not instantiated
+        //template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        //template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        //template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+
+        template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        //template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+        template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -47,219 +47,217 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace imgproc {
-    
-template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
+namespace cv { namespace gpu { namespace device 
 {
-    const int x = blockDim.x * blockIdx.x + threadIdx.x;
-    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    namespace imgproc 
+    {    
+        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst) // Each thread writes at most one dst pixel, sampled from src at fractional coordinates.
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x; // dst column handled by this thread
+            const int y = blockDim.y * blockIdx.y + threadIdx.y; // dst row handled by this thread

-    if (x < dst.cols && y < dst.rows)
-    {
-        const float xcoo = x / fx;
-        const float ycoo = y / fy;
+            if (x < dst.cols && y < dst.rows) // threads that fall outside dst do nothing
+            {
+                const float xcoo = x / fx; // dst column divided by fx: the x coordinate passed to src
+                const float ycoo = y / fy; // dst row divided by fy: the y coordinate passed to src

-        dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-    }
-}
-template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
-{
-    const int x = blockDim.x * blockIdx.x + threadIdx.x;
-    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+                dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo)); // src is called as (y, x); the sample is converted to T with saturate_cast
+            }
+        }
+        template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst) // Each thread writes at most one dst pixel, read from src at integer coordinates (no filtering, no saturate_cast).
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x; // dst column handled by this thread
+            const int y = blockDim.y * blockIdx.y + threadIdx.y; // dst row handled by this thread

-    if (x < dst.cols && y < dst.rows)
-    {
-        const float xcoo = x / fx;
-        const float ycoo = y / fy;
+            if (x < dst.cols && y < dst.rows) // threads that fall outside dst do nothing
+            {
+                const float xcoo = x / fx; // fractional x coordinate before rounding
+                const float ycoo = y / fy; // fractional y coordinate before rounding

-        dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
-    }
-}
+                dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo)); // __float2int_rd rounds toward negative infinity, so the pixel at the floor of each coordinate is copied
+            }
+        }

-template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
-{
-    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
-    {            
-        dim3 block(32, 8);
-        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        template <template <typename> class Filter, typename T> struct ResizeDispatcherStream // Stream variant: launches resize on the caller's stream and returns without synchronizing.
+        {
+            static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
+            {            
+                dim3 block(32, 8); // 32x8 threads per block
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // grid derived from dst size; divUp presumably rounds up so edge tiles are covered - confirm

-        BrdReplicate<T> brd(src.rows, src.cols);
-        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-        Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
+                BrdReplicate<T> brd(src.rows, src.cols); // border policy built from the source dimensions
+                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd); // src reads go through the BrdReplicate policy
+                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc); // Filter is layered on top of the bordered reader

-        resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
-        cudaSafeCall( cudaGetLastError() );
-    }
-};
-template <typename T> struct ResizeDispatcherStream<PointFilter, T>
-{
-    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
-    {            
-        dim3 block(32, 8);
-        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+                resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst); // no dynamic shared memory; queued on stream
+                cudaSafeCall( cudaGetLastError() ); // checks the launch only; there is no synchronization in this function
+            }
+        };
+        template <typename T> struct ResizeDispatcherStream<PointFilter, T> // PointFilter specialization: launches resizeNN on the caller's stream, passing the bordered reader directly instead of a Filter.
+        {
+            static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
+            {            
+                dim3 block(32, 8); // 32x8 threads per block
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // grid derived from dst size

-        BrdReplicate<T> brd(src.rows, src.cols);
-        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
+                BrdReplicate<T> brd(src.rows, src.cols); // border policy built from the source dimensions
+                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd); // src reads go through the BrdReplicate policy

-        resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
-        cudaSafeCall( cudaGetLastError() );
-    }
-};
+                resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst); // no dynamic shared memory; queued on stream
+                cudaSafeCall( cudaGetLastError() ); // checks the launch only; there is no synchronization in this function
+            }
+        };

-template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
-{
-    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
-    {            
-        dim3 block(32, 8);
-        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream // Non-stream variant: launches resize without a stream argument and waits for the device before returning.
+        {
+            static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
+            {            
+                dim3 block(32, 8); // 32x8 threads per block
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // grid derived from dst size

-        BrdReplicate<T> brd(src.rows, src.cols);
-        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-        Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);
+                BrdReplicate<T> brd(src.rows, src.cols); // border policy built from the source dimensions
+                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd); // src reads go through the BrdReplicate policy
+                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc); // Filter is layered on top of the bordered reader

-        resize<<<grid, block>>>(filter_src, fx, fy, dst);
-        cudaSafeCall( cudaGetLastError() );
+                resize<<<grid, block>>>(filter_src, fx, fy, dst); // launched with the default launch parameters (no stream given)
+                cudaSafeCall( cudaGetLastError() ); // checks the launch itself

-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-};
-template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
-{
-    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
-    {            
-        dim3 block(32, 8);
-        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+                cudaSafeCall( cudaDeviceSynchronize() ); // blocks until the device has finished, so the call is synchronous for the caller
+            }
+        };
+        template <typename T> struct ResizeDispatcherNonStream<PointFilter, T> // PointFilter specialization: launches resizeNN without a stream argument and waits for the device before returning.
+        {
+            static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
+            {            
+                dim3 block(32, 8); // 32x8 threads per block
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); // grid derived from dst size

-        BrdReplicate<T> brd(src.rows, src.cols);
-        BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
+                BrdReplicate<T> brd(src.rows, src.cols); // border policy built from the source dimensions
+                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd); // src reads go through the BrdReplicate policy

-        resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
-        cudaSafeCall( cudaGetLastError() );
+                resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst); // bordered reader is passed directly; no Filter is involved
+                cudaSafeCall( cudaGetLastError() ); // checks the launch itself

-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-};
+                cudaSafeCall( cudaDeviceSynchronize() ); // blocks until the device has finished, so the call is synchronous for the caller
+            }
+        };

-#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
-    texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-    struct tex_resize_ ## type ## _reader \
-    { \
-        typedef type elem_type; \
-        typedef int index_type; \
-        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-        { \
-            return tex2D(tex_resize_ ## type , x, y); \
-        } \
-    }; \
-    template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
-    { \
-        static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
-        { \
-            dim3 block(32, 8); \
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-            bindTexture(&tex_resize_ ## type , src); \
-            tex_resize_ ## type ##_reader texSrc; \
-            Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
-            resize<<<grid, block>>>(filter_src, fx, fy, dst); \
-            cudaSafeCall( cudaGetLastError() ); \
-            cudaSafeCall( cudaDeviceSynchronize() ); \
-        } \
-    }; \
-    template <> struct ResizeDispatcherNonStream<PointFilter, type> \
-    { \
-        static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
-        { \
-            dim3 block(32, 8); \
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-            bindTexture(&tex_resize_ ## type , src); \
-            tex_resize_ ## type ##_reader texSrc; \
-            resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
-            cudaSafeCall( cudaGetLastError() ); \
-            cudaSafeCall( cudaDeviceSynchronize() ); \
-        } \
-    };
-    
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
+        #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) /* For one element type: declares the 2D texture tex_resize_<type> (point filter mode, clamp addressing), a reader functor that fetches from it with tex2D, and two ResizeDispatcherNonStream specializations (any Filter, and PointFilter) that bind src to the texture, launch the kernel and synchronize. */ \
+            texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_resize_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_resize_ ## type , x, y); \
+                } \
+            }; \
+            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type> \
+            { \
+                static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_resize_ ## type , src); \
+                    tex_resize_ ## type ##_reader texSrc; \
+                    Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
+                    resize<<<grid, block>>>(filter_src, fx, fy, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            }; \
+            template <> struct ResizeDispatcherNonStream<PointFilter, type> \
+            { \
+                static void call(const DevMem2D_< type >& src, float fx, float fy, const DevMem2D_< type >& dst) \
+                { \
+                    dim3 block(32, 8); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_resize_ ## type , src); \
+                    tex_resize_ ## type ##_reader texSrc; \
+                    resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+            
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)

-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)

-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)

-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)

-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)

-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
-//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
-OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
+        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
+        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)

-#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
+        #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX

-template <template <typename> class Filter, typename T> struct ResizeDispatcher
-{ 
-    static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
-    {
-        if (stream == 0)
-            ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
-        else
-            ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
-    }
-};
+        template <template <typename> class Filter, typename T> struct ResizeDispatcher // Chooses between the NonStream and Stream launchers based on the stream argument.
+        { 
+            static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
+            {
+                if (stream == 0) // stream 0: use the NonStream launcher, which synchronizes the device after the launch
+                    ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
+                else // any other stream: launch on that stream without synchronizing
+                    ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
+            }
+        };

-template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)
-{
-    typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);
+        template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream) // Host entry point: reinterprets src/dst as DevMem2D_<T> and forwards to the ResizeDispatcher selected by interpolation.
+        {
+            typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);

-    static const caller_t callers[3] = 
-    {
-        ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
-    };
+            static const caller_t callers[3] = // table order: 0 = PointFilter, 1 = LinearFilter, 2 = CubicFilter
+            {
+                ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
+            };

-    callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
-}
+            callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream); // NOTE(review): interpolation indexes the 3-entry table unchecked here - presumably validated by the caller; confirm
+        }

-template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

-//template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

-template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort >(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<ushort2>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort3>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<ushort4>(const DevMem2Db& src,float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

-template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

-//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

-template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
-
-} // namespace imgproc
-
-END_OPENCV_DEVICE_NAMESPACE
+        template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        //template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+        template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
+    } // namespace imgproc
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
@@ -47,226 +47,224 @@
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-#define MAX_KERNEL_SIZE 16
-#define BLOCK_DIM_X 16
-#define BLOCK_DIM_Y 4
-#define RESULT_STEPS 8
-#define HALO_STEPS 1
-
-namespace row_filter {
-
-__constant__ float c_kernel[MAX_KERNEL_SIZE];
-
-void loadKernel(const float kernel[], int ksize)
+namespace cv { namespace gpu { namespace device 
 {
-    cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
-}
+    #define MAX_KERNEL_SIZE 16
+    #define BLOCK_DIM_X 16
+    #define BLOCK_DIM_Y 4
+    #define RESULT_STEPS 8
+    #define HALO_STEPS 1

-namespace detail
-{
-    template <typename T, size_t size> struct SmemType
+    namespace row_filter 
    {
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
-    };
+        __constant__ float c_kernel[MAX_KERNEL_SIZE];

-    template <typename T> struct SmemType<T, 4>
-    {
-        typedef T smem_t;
-    };
-}
-
-template <typename T> struct SmemType
-{
-    typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
-};
-
-template <int KERNEL_SIZE, typename T, typename D, typename B>
-__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
-{
-    typedef typename SmemType<T>::smem_t smem_t;
-    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-
-    __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];
-
-    //Offset to the left halo edge
-    const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
-    const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
-
-    if (y < src.rows)
-    {
-        const T* src_row = src.ptr(y);
-
-        //Load main data
-        #pragma unroll
-        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
-            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
-
-        //Load left halo
-        #pragma unroll
-        for(int i = 0; i < HALO_STEPS; ++i)
-            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
-
-        //Load right halo
-        #pragma unroll
-        for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
-            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
-
-        __syncthreads();
-
-        D* dst_row = dst.ptr(y);
-
-        #pragma unroll
-        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
+        void loadKernel(const float kernel[], int ksize) // Copies ksize floats from kernel into the __constant__ array c_kernel.
        {
-            sum_t sum = VecTraits<sum_t>::all(0);
-
-            #pragma unroll
-            for (int j = 0; j < KERNEL_SIZE; ++j)
-                sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];
-
-            int dstX = x + i * BLOCK_DIM_X;
-
-            if (dstX < src.cols)
-                dst_row[dstX] = saturate_cast<D>(sum);
+            cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); // NOTE(review): ksize is not checked against MAX_KERNEL_SIZE (the length of c_kernel) here - confirm callers guarantee it
        }
-    }
-}

-template <int ksize, typename T, typename D, template<typename> class B>
-void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
-{
-    typedef typename SmemType<T>::smem_t smem_t;
-
-    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
-    const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
-
-    B<smem_t> b(src.cols);
-
-    linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-template <typename T, typename D>
-void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
-{
-    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
-    static const caller_t callers[5][17] = 
-    {
+        namespace detail
        {
-            0, 
-            linearRowFilter_caller<1 , T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
-            linearRowFilter_caller<3 , T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<4 , T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<5 , T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<6 , T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
-            linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
-            linearRowFilter_caller<9 , T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<10, T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<11, T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<12, T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<13, T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<14, T, D, BrdRowReflect101>,
-            linearRowFilter_caller<15, T, D, BrdRowReflect101>, 
-            linearRowFilter_caller<16, T, D, BrdRowReflect101>
-        },
-        {
-            0, 
-            linearRowFilter_caller<1 , T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
-            linearRowFilter_caller<3 , T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<4 , T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<5 , T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<6 , T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<7 , T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
-            linearRowFilter_caller<9 , T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<10, T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<11, T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<12, T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<13, T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<14, T, D, BrdRowReplicate>,
-            linearRowFilter_caller<15, T, D, BrdRowReplicate>, 
-            linearRowFilter_caller<16, T, D, BrdRowReplicate>
-        },
-        {
-            0, 
-            linearRowFilter_caller<1 , T, D, BrdRowConstant>, 
-            linearRowFilter_caller<2 , T, D, BrdRowConstant>,
-            linearRowFilter_caller<3 , T, D, BrdRowConstant>, 
-            linearRowFilter_caller<4 , T, D, BrdRowConstant>, 
-            linearRowFilter_caller<5 , T, D, BrdRowConstant>, 
-            linearRowFilter_caller<6 , T, D, BrdRowConstant>, 
-            linearRowFilter_caller<7 , T, D, BrdRowConstant>, 
-            linearRowFilter_caller<8 , T, D, BrdRowConstant>,
-            linearRowFilter_caller<9 , T, D, BrdRowConstant>,
-            linearRowFilter_caller<10, T, D, BrdRowConstant>, 
-            linearRowFilter_caller<11, T, D, BrdRowConstant>, 
-            linearRowFilter_caller<12, T, D, BrdRowConstant>, 
-            linearRowFilter_caller<13, T, D, BrdRowConstant>,
-            linearRowFilter_caller<14, T, D, BrdRowConstant>,
-            linearRowFilter_caller<15, T, D, BrdRowConstant>, 
-            linearRowFilter_caller<16, T, D, BrdRowConstant>
-        },
-        {
-            0, 
-            linearRowFilter_caller<1 , T, D, BrdRowReflect>, 
-            linearRowFilter_caller<2 , T, D, BrdRowReflect>,
-            linearRowFilter_caller<3 , T, D, BrdRowReflect>, 
-            linearRowFilter_caller<4 , T, D, BrdRowReflect>, 
-            linearRowFilter_caller<5 , T, D, BrdRowReflect>, 
-            linearRowFilter_caller<6 , T, D, BrdRowReflect>, 
-            linearRowFilter_caller<7 , T, D, BrdRowReflect>, 
-            linearRowFilter_caller<8 , T, D, BrdRowReflect>,
-            linearRowFilter_caller<9 , T, D, BrdRowReflect>,
-            linearRowFilter_caller<10, T, D, BrdRowReflect>, 
-            linearRowFilter_caller<11, T, D, BrdRowReflect>, 
-            linearRowFilter_caller<12, T, D, BrdRowReflect>, 
-            linearRowFilter_caller<13, T, D, BrdRowReflect>,
-            linearRowFilter_caller<14, T, D, BrdRowReflect>,
-            linearRowFilter_caller<15, T, D, BrdRowReflect>, 
-            linearRowFilter_caller<16, T, D, BrdRowReflect>
-        },
-        {
-            0, 
-            linearRowFilter_caller<1 , T, D, BrdRowWrap>, 
-            linearRowFilter_caller<2 , T, D, BrdRowWrap>,
-            linearRowFilter_caller<3 , T, D, BrdRowWrap>, 
-            linearRowFilter_caller<4 , T, D, BrdRowWrap>, 
-            linearRowFilter_caller<5 , T, D, BrdRowWrap>, 
-            linearRowFilter_caller<6 , T, D, BrdRowWrap>, 
-            linearRowFilter_caller<7 , T, D, BrdRowWrap>, 
-            linearRowFilter_caller<8 , T, D, BrdRowWrap>,
-            linearRowFilter_caller<9 , T, D, BrdRowWrap>,
-            linearRowFilter_caller<10, T, D, BrdRowWrap>, 
-            linearRowFilter_caller<11, T, D, BrdRowWrap>, 
-            linearRowFilter_caller<12, T, D, BrdRowWrap>, 
-            linearRowFilter_caller<13, T, D, BrdRowWrap>,
-            linearRowFilter_caller<14, T, D, BrdRowWrap>,
-            linearRowFilter_caller<15, T, D, BrdRowWrap>, 
-            linearRowFilter_caller<16, T, D, BrdRowWrap>
+            template <typename T, size_t size> struct SmemType // Primary template: smem_t is the float vector type with the same channel count as T.
+            {
+                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t; // VecTraits<T>::cn supplies the channel count for TypeVec
+            };
+
+            template <typename T> struct SmemType<T, 4> // Specialization for size == 4: T itself is used as smem_t, with no conversion to a float vector.
+            {
+                typedef T smem_t;
+            };
        }
-    };
-    
-    loadKernel(kernel, ksize);

-    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
-}
+        template <typename T> struct SmemType
+        {
+            typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
+        };

-template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearRowFilter_gpu<int   , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
-template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template <int KERNEL_SIZE, typename T, typename D, typename B>
+        __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
+        {
+            typedef typename SmemType<T>::smem_t smem_t;
+            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

-} // namespace row_filter
+            __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];

-END_OPENCV_DEVICE_NAMESPACE
+            //Offset to the left halo edge
+            const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
+            const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
+
+            if (y < src.rows)
+            {
+                const T* src_row = src.ptr(y);
+
+                //Load main data
+                #pragma unroll
+                for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
+                    smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
+
+                //Load left halo
+                #pragma unroll
+                for(int i = 0; i < HALO_STEPS; ++i)
+                    smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
+
+                //Load right halo
+                #pragma unroll
+                for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
+                    smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
+
+                __syncthreads();
+
+                D* dst_row = dst.ptr(y);
+
+                #pragma unroll
+                for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
+                {
+                    sum_t sum = VecTraits<sum_t>::all(0);
+
+                    #pragma unroll
+                    for (int j = 0; j < KERNEL_SIZE; ++j)
+                        sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];
+
+                    int dstX = x + i * BLOCK_DIM_X;
+
+                    if (dstX < src.cols)
+                        dst_row[dstX] = saturate_cast<D>(sum);
+                }
+            }
+        }
+
+        template <int ksize, typename T, typename D, template<typename> class B>
+        void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
+        {
+            typedef typename SmemType<T>::smem_t smem_t;
+
+            const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
+            const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+            B<smem_t> b(src.cols);
+
+            linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template <typename T, typename D>
+        void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
+        {
+            typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
+            static const caller_t callers[5][17] = 
+            {
+                {
+                    0, 
+                    linearRowFilter_caller<1 , T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
+                    linearRowFilter_caller<3 , T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<4 , T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<5 , T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<6 , T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
+                    linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
+                    linearRowFilter_caller<9 , T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<10, T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<11, T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<12, T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<13, T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<14, T, D, BrdRowReflect101>,
+                    linearRowFilter_caller<15, T, D, BrdRowReflect101>, 
+                    linearRowFilter_caller<16, T, D, BrdRowReflect101>
+                },
+                {
+                    0, 
+                    linearRowFilter_caller<1 , T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
+                    linearRowFilter_caller<3 , T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<4 , T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<5 , T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<6 , T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<7 , T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
+                    linearRowFilter_caller<9 , T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<10, T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<11, T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<12, T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<13, T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<14, T, D, BrdRowReplicate>,
+                    linearRowFilter_caller<15, T, D, BrdRowReplicate>, 
+                    linearRowFilter_caller<16, T, D, BrdRowReplicate>
+                },
+                {
+                    0, 
+                    linearRowFilter_caller<1 , T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<2 , T, D, BrdRowConstant>,
+                    linearRowFilter_caller<3 , T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<4 , T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<5 , T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<6 , T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<7 , T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<8 , T, D, BrdRowConstant>,
+                    linearRowFilter_caller<9 , T, D, BrdRowConstant>,
+                    linearRowFilter_caller<10, T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<11, T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<12, T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<13, T, D, BrdRowConstant>,
+                    linearRowFilter_caller<14, T, D, BrdRowConstant>,
+                    linearRowFilter_caller<15, T, D, BrdRowConstant>, 
+                    linearRowFilter_caller<16, T, D, BrdRowConstant>
+                },
+                {
+                    0, 
+                    linearRowFilter_caller<1 , T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<2 , T, D, BrdRowReflect>,
+                    linearRowFilter_caller<3 , T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<4 , T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<5 , T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<6 , T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<7 , T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<8 , T, D, BrdRowReflect>,
+                    linearRowFilter_caller<9 , T, D, BrdRowReflect>,
+                    linearRowFilter_caller<10, T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<11, T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<12, T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<13, T, D, BrdRowReflect>,
+                    linearRowFilter_caller<14, T, D, BrdRowReflect>,
+                    linearRowFilter_caller<15, T, D, BrdRowReflect>, 
+                    linearRowFilter_caller<16, T, D, BrdRowReflect>
+                },
+                {
+                    0, 
+                    linearRowFilter_caller<1 , T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<2 , T, D, BrdRowWrap>,
+                    linearRowFilter_caller<3 , T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<4 , T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<5 , T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<6 , T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<7 , T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<8 , T, D, BrdRowWrap>,
+                    linearRowFilter_caller<9 , T, D, BrdRowWrap>,
+                    linearRowFilter_caller<10, T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<11, T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<12, T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<13, T, D, BrdRowWrap>,
+                    linearRowFilter_caller<14, T, D, BrdRowWrap>,
+                    linearRowFilter_caller<15, T, D, BrdRowWrap>, 
+                    linearRowFilter_caller<16, T, D, BrdRowWrap>
+                }
+            };
+            
+            loadKernel(kernel, ksize);
+
+            callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
+        }
+
+        template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        //template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearRowFilter_gpu<int   , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+        template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
+    } // namespace row_filter
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -62,44 +62,43 @@
    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__)
 #endif

-namespace cv { namespace gpu {
-
-void error(const char *error_string, const char *file, const int line, const char *func = "");
-void nppError(int err, const char *file, const int line, const char *func = "");
-void ncvError(int err, const char *file, const int line, const char *func = "");
-void cufftError(int err, const char *file, const int line, const char *func = "");
-void cublasError(int err, const char *file, const int line, const char *func = "");
-
-static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+namespace cv { namespace gpu 
 {
-    if (cudaSuccess != err)
-        cv::gpu::error(cudaGetErrorString(err), file, line, func);
-}
+    void error(const char *error_string, const char *file, const int line, const char *func = "");
+    void nppError(int err, const char *file, const int line, const char *func = "");
+    void ncvError(int err, const char *file, const int line, const char *func = "");
+    void cufftError(int err, const char *file, const int line, const char *func = "");
+    void cublasError(int err, const char *file, const int line, const char *func = "");

-static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
-{
-    if (err < 0)
-        cv::gpu::nppError(err, file, line, func);
-}
+    static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
+    {
+        if (cudaSuccess != err)
+            cv::gpu::error(cudaGetErrorString(err), file, line, func);
+    }

-static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
-{
-    if (NCV_SUCCESS != err)
-        cv::gpu::ncvError(err, file, line, func);
-}
+    static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
+    {
+        if (err < 0)
+            cv::gpu::nppError(err, file, line, func);
+    }

-static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
-{
-    if (CUFFT_SUCCESS != err)
-        cv::gpu::cufftError(err, file, line, func);
-}
+    static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
+    {
+        if (NCV_SUCCESS != err)
+            cv::gpu::ncvError(err, file, line, func);
+    }

-static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
-{
-    if (CUBLAS_STATUS_SUCCESS != err)
-        cv::gpu::cublasError(err, file, line, func);
-}
+    static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
+    {
+        if (CUFFT_SUCCESS != err)
+            cv::gpu::cufftError(err, file, line, func);
+    }

+    static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
+    {
+        if (CUBLAS_STATUS_SUCCESS != err)
+            cv::gpu::cublasError(err, file, line, func);
+    }
 }}

 #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -42,467 +42,465 @@

 #include "internal_shared.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace split_merge {
-
-template <typename T, size_t elem_size = sizeof(T)>
-struct TypeTraits 
+namespace cv { namespace gpu { namespace device 
 {
-    typedef T type;
-    typedef T type2;
-    typedef T type3;
-    typedef T type4;
-};
-
-template <typename T>
-struct TypeTraits<T, 1>
-{
-    typedef char type;
-    typedef char2 type2;
-    typedef char3 type3;
-    typedef char4 type4;
-};
-
-template <typename T>
-struct TypeTraits<T, 2>
-{
-    typedef short type;
-    typedef short2 type2;
-    typedef short3 type3;
-    typedef short4 type4;
-};
-
-template <typename T>
-struct TypeTraits<T, 4> 
-{
-    typedef int type;
-    typedef int2 type2;
-    typedef int3 type3;
-    typedef int4 type4;
-};
-
-template <typename T>
-struct TypeTraits<T, 8> 
-{
-    typedef double type;
-    typedef double2 type2;
-    //typedef double3 type3;
-    //typedef double4 type3;
-};
-
-typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
-typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
-
-//------------------------------------------------------------
-// Merge    
-
-template <typename T>
-__global__ void mergeC2_(const uchar* src0, size_t src0_step, 
-                         const uchar* src1, size_t src1_step, 
-                         int rows, int cols, uchar* dst, size_t dst_step)
-{
-    typedef typename TypeTraits<T>::type2 dst_type;
-
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    const T* src0_y = (const T*)(src0 + y * src0_step);
-    const T* src1_y = (const T*)(src1 + y * src1_step);
-    dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-    if (x < cols && y < rows) 
-    {                        
-        dst_type dst_elem;
-        dst_elem.x = src0_y[x];
-        dst_elem.y = src1_y[x];
-        dst_y[x] = dst_elem;
-    }
-}
-
-
-template <typename T>
-__global__ void mergeC3_(const uchar* src0, size_t src0_step, 
-                         const uchar* src1, size_t src1_step, 
-                         const uchar* src2, size_t src2_step, 
-                         int rows, int cols, uchar* dst, size_t dst_step)
-{
-    typedef typename TypeTraits<T>::type3 dst_type;
-
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    const T* src0_y = (const T*)(src0 + y * src0_step);
-    const T* src1_y = (const T*)(src1 + y * src1_step);
-    const T* src2_y = (const T*)(src2 + y * src2_step);
-    dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-    if (x < cols && y < rows) 
-    {                        
-        dst_type dst_elem;
-        dst_elem.x = src0_y[x];
-        dst_elem.y = src1_y[x];
-        dst_elem.z = src2_y[x];
-        dst_y[x] = dst_elem;
-    }
-}
-
-
-template <>
-__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, 
-                         const uchar* src1, size_t src1_step, 
-                         const uchar* src2, size_t src2_step, 
-                         int rows, int cols, uchar* dst, size_t dst_step)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    const double* src0_y = (const double*)(src0 + y * src0_step);
-    const double* src1_y = (const double*)(src1 + y * src1_step);
-    const double* src2_y = (const double*)(src2 + y * src2_step);
-    double* dst_y = (double*)(dst + y * dst_step);
-
-    if (x < cols && y < rows) 
-    {                        
-        dst_y[3 * x] = src0_y[x];
-        dst_y[3 * x + 1] = src1_y[x];
-        dst_y[3 * x + 2] = src2_y[x];
-    }
-}
-
-
-template <typename T>
-__global__ void mergeC4_(const uchar* src0, size_t src0_step, 
-                         const uchar* src1, size_t src1_step, 
-                         const uchar* src2, size_t src2_step, 
-                         const uchar* src3, size_t src3_step, 
-                         int rows, int cols, uchar* dst, size_t dst_step)
-{
-    typedef typename TypeTraits<T>::type4 dst_type;
-
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    const T* src0_y = (const T*)(src0 + y * src0_step);
-    const T* src1_y = (const T*)(src1 + y * src1_step);
-    const T* src2_y = (const T*)(src2 + y * src2_step);
-    const T* src3_y = (const T*)(src3 + y * src3_step);
-    dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-    if (x < cols && y < rows) 
-    {                        
-        dst_type dst_elem;
-        dst_elem.x = src0_y[x];
-        dst_elem.y = src1_y[x];
-        dst_elem.z = src2_y[x];
-        dst_elem.w = src3_y[x];
-        dst_y[x] = dst_elem;
-    }
-}
-
-
-template <>
-__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, 
-                         const uchar* src1, size_t src1_step, 
-                         const uchar* src2, size_t src2_step, 
-                         const uchar* src3, size_t src3_step, 
-                         int rows, int cols, uchar* dst, size_t dst_step)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    const double* src0_y = (const double*)(src0 + y * src0_step);
-    const double* src1_y = (const double*)(src1 + y * src1_step);
-    const double* src2_y = (const double*)(src2 + y * src2_step);
-    const double* src3_y = (const double*)(src3 + y * src3_step);
-    double2* dst_y = (double2*)(dst + y * dst_step);
-
-    if (x < cols && y < rows) 
-    {                        
-        dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
-        dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
-    }
-}
-
-
-template <typename T>
-static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
-{
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].data, src[0].step,
-            src[1].data, src[1].step,
-            dst.rows, dst.cols, dst.data, dst.step);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall(cudaDeviceSynchronize());
-}
-
-
-template <typename T>
-static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
-{
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].data, src[0].step,
-            src[1].data, src[1].step,
-            src[2].data, src[2].step,
-            dst.rows, dst.cols, dst.data, dst.step);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall(cudaDeviceSynchronize());
-}
-
-
-template <typename T>
-static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
-{
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-    mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
-            src[0].data, src[0].step,
-            src[1].data, src[1].step,
-            src[2].data, src[2].step,
-            src[3].data, src[3].step,
-            dst.rows, dst.cols, dst.data, dst.step);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall(cudaDeviceSynchronize());
-}
-
-
-void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
-                             int total_channels, size_t elem_size,
-                             const cudaStream_t& stream)
-{
-    static MergeFunction merge_func_tbl[] =
+    namespace split_merge 
    {
-        mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
-        mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
-        mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
-    };
+        template <typename T, size_t elem_size = sizeof(T)>
+        struct TypeTraits 
+        {
+            typedef T type;
+            typedef T type2;
+            typedef T type3;
+            typedef T type4;
+        };

-    size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
-    MergeFunction merge_func = merge_func_tbl[merge_func_id];
+        template <typename T>
+        struct TypeTraits<T, 1>
+        {
+            typedef char type;
+            typedef char2 type2;
+            typedef char3 type3;
+            typedef char4 type4;
+        };

-    if (merge_func == 0)
-        cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
+        template <typename T>
+        struct TypeTraits<T, 2>
+        {
+            typedef short type;
+            typedef short2 type2;
+            typedef short3 type3;
+            typedef short4 type4;
+        };

-    merge_func(src, dst, stream);
-}
+        template <typename T>
+        struct TypeTraits<T, 4> 
+        {
+            typedef int type;
+            typedef int2 type2;
+            typedef int3 type3;
+            typedef int4 type4;
+        };
+
+        template <typename T>
+        struct TypeTraits<T, 8> 
+        {
+            typedef double type;
+            typedef double2 type2;
+            //typedef double3 type3;
+            //typedef double4 type4;
+        };
+
+        typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
+        typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
+
+        //------------------------------------------------------------
+        // Merge    
+
+        template <typename T>
+        __global__ void mergeC2_(const uchar* src0, size_t src0_step, 
+                                 const uchar* src1, size_t src1_step, 
+                                 int rows, int cols, uchar* dst, size_t dst_step)
+        {
+            typedef typename TypeTraits<T>::type2 dst_type;
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const T* src0_y = (const T*)(src0 + y * src0_step);
+            const T* src1_y = (const T*)(src1 + y * src1_step);
+            dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+            if (x < cols && y < rows) 
+            {                        
+                dst_type dst_elem;
+                dst_elem.x = src0_y[x];
+                dst_elem.y = src1_y[x];
+                dst_y[x] = dst_elem;
+            }
+        }
+
+
+        template <typename T>
+        __global__ void mergeC3_(const uchar* src0, size_t src0_step, 
+                                 const uchar* src1, size_t src1_step, 
+                                 const uchar* src2, size_t src2_step, 
+                                 int rows, int cols, uchar* dst, size_t dst_step)
+        {
+            typedef typename TypeTraits<T>::type3 dst_type;
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const T* src0_y = (const T*)(src0 + y * src0_step);
+            const T* src1_y = (const T*)(src1 + y * src1_step);
+            const T* src2_y = (const T*)(src2 + y * src2_step);
+            dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+            if (x < cols && y < rows) 
+            {                        
+                dst_type dst_elem;
+                dst_elem.x = src0_y[x];
+                dst_elem.y = src1_y[x];
+                dst_elem.z = src2_y[x];
+                dst_y[x] = dst_elem;
+            }
+        }
+
+
+        template <>
+        __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, 
+                                 const uchar* src1, size_t src1_step, 
+                                 const uchar* src2, size_t src2_step, 
+                                 int rows, int cols, uchar* dst, size_t dst_step)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const double* src0_y = (const double*)(src0 + y * src0_step);
+            const double* src1_y = (const double*)(src1 + y * src1_step);
+            const double* src2_y = (const double*)(src2 + y * src2_step);
+            double* dst_y = (double*)(dst + y * dst_step);
+
+            if (x < cols && y < rows) 
+            {                        
+                dst_y[3 * x] = src0_y[x];
+                dst_y[3 * x + 1] = src1_y[x];
+                dst_y[3 * x + 2] = src2_y[x];
+            }
+        }
+
+
+        template <typename T>
+        __global__ void mergeC4_(const uchar* src0, size_t src0_step, 
+                                 const uchar* src1, size_t src1_step, 
+                                 const uchar* src2, size_t src2_step, 
+                                 const uchar* src3, size_t src3_step, 
+                                 int rows, int cols, uchar* dst, size_t dst_step)
+        {
+            typedef typename TypeTraits<T>::type4 dst_type;
+
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const T* src0_y = (const T*)(src0 + y * src0_step);
+            const T* src1_y = (const T*)(src1 + y * src1_step);
+            const T* src2_y = (const T*)(src2 + y * src2_step);
+            const T* src3_y = (const T*)(src3 + y * src3_step);
+            dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+            if (x < cols && y < rows) 
+            {                        
+                dst_type dst_elem;
+                dst_elem.x = src0_y[x];
+                dst_elem.y = src1_y[x];
+                dst_elem.z = src2_y[x];
+                dst_elem.w = src3_y[x];
+                dst_y[x] = dst_elem;
+            }
+        }
+
+
+        template <>
+        __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, 
+                                 const uchar* src1, size_t src1_step, 
+                                 const uchar* src2, size_t src2_step, 
+                                 const uchar* src3, size_t src3_step, 
+                                 int rows, int cols, uchar* dst, size_t dst_step)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const double* src0_y = (const double*)(src0 + y * src0_step);
+            const double* src1_y = (const double*)(src1 + y * src1_step);
+            const double* src2_y = (const double*)(src2 + y * src2_step);
+            const double* src3_y = (const double*)(src3 + y * src3_step);
+            double2* dst_y = (double2*)(dst + y * dst_step);
+
+            if (x < cols && y < rows) 
+            {                        
+                dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
+                dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
+            }
+        }
+
+
+        template <typename T>
+        static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+        {
+            dim3 blockDim(32, 8);
+            dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+            mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+                    src[0].data, src[0].step,
+                    src[1].data, src[1].step,
+                    dst.rows, dst.cols, dst.data, dst.step);
+            cudaSafeCall( cudaGetLastError() );
+
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }
+
+
+        // Host-side launcher for the three-channel merge kernel.
+        // Starts mergeC3_<T> with 32x8 thread blocks on a grid sized to cover
+        // dst.cols x dst.rows, passing src[0..2] and dst. The kernel is enqueued
+        // on `stream`; when `stream` is 0 the call also waits for the device to finish.
+        template <typename T>
+        static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+            const DevMem2Db& plane0 = src[0];
+            const DevMem2Db& plane1 = src[1];
+            const DevMem2Db& plane2 = src[2];
+
+            mergeC3_<T><<<grid, block, 0, stream>>>(
+                    plane0.data, plane0.step,
+                    plane1.data, plane1.step,
+                    plane2.data, plane2.step,
+                    dst.rows, dst.cols, dst.data, dst.step);
+
+            // Report launch-configuration errors right away.
+            cudaSafeCall( cudaGetLastError() );
+
+            // Stream 0: block here so the result is complete when we return.
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+
+        // Host-side launcher for the four-channel merge kernel.
+        // Starts mergeC4_<T> with 32x8 thread blocks on a grid sized to cover
+        // dst.cols x dst.rows, passing src[0..3] and dst. The kernel is enqueued
+        // on `stream`; when `stream` is 0 the call also waits for the device to finish.
+        template <typename T>
+        static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+        {
+            const dim3 block(32, 8);
+            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+
+            const DevMem2Db& plane0 = src[0];
+            const DevMem2Db& plane1 = src[1];
+            const DevMem2Db& plane2 = src[2];
+            const DevMem2Db& plane3 = src[3];
+
+            mergeC4_<T><<<grid, block, 0, stream>>>(
+                    plane0.data, plane0.step,
+                    plane1.data, plane1.step,
+                    plane2.data, plane2.step,
+                    plane3.data, plane3.step,
+                    dst.rows, dst.cols, dst.data, dst.step);
+
+            // Report launch-configuration errors right away.
+            cudaSafeCall( cudaGetLastError() );
+
+            // Stream 0: block here so the result is complete when we return.
+            if (stream == 0)
+            {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+
+        // Dispatches a channel merge to the launcher matching the channel count and
+        // per-channel element size.
+        //
+        //   src            - array of `total_channels` single-channel source planes
+        //   dst            - destination image receiving the merged channels
+        //   total_channels - number of planes to merge; the table covers 2, 3 and 4
+        //   elem_size      - size in bytes of one channel element; the table has
+        //                    entries for 1, 2, 4 and 8
+        //   stream         - CUDA stream handed to the launcher
+        //
+        // Any combination without a table entry is reported through cv::gpu::error.
+        void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
+                                     int total_channels, size_t elem_size,
+                                     const cudaStream_t& stream)
+        {
+            // One row per channel count (2, 3, 4); the column is elem_size >> 1,
+            // so column 3 has no launcher.
+            static const MergeFunction merge_func_tbl[] =
+            {
+                mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
+                mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
+                mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
+            };
+
+            const size_t types_per_row = 5;
+            const size_t type_id = elem_size >> 1;
+
+            // Validate both coordinates before indexing: an unchecked channel count
+            // reads outside the table, and an oversized element size would spill
+            // into the next channel row and pick the wrong launcher.
+            MergeFunction merge_func = 0;
+            if (total_channels >= 2 && total_channels <= 4 && type_id < types_per_row)
+                merge_func = merge_func_tbl[(total_channels - 2) * types_per_row + type_id];
+
+            if (merge_func == 0)
+            {
+                cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
+                // NOTE(review): cv::gpu::error presumably does not return; the explicit
+                // return keeps a null launcher from ever being called if it does.
+                return;
+            }
+
+            merge_func(src, dst, stream);
+        }



-//------------------------------------------------------------
-// Split
+        //------------------------------------------------------------
+        // Split


-template <typename T>
-__global__ void splitC2_(const uchar* src, size_t src_step, 
-                        int rows, int cols,
-                        uchar* dst0, size_t dst0_step,
-                        uchar* dst1, size_t dst1_step)
-{
-    typedef typename TypeTraits<T>::type2 src_type;
+        template <typename T>
+        __global__ void splitC2_(const uchar* src, size_t src_step, 
+                                int rows, int cols,
+                                uchar* dst0, size_t dst0_step,
+                                uchar* dst1, size_t dst1_step)
+        {
+            typedef typename TypeTraits<T>::type2 src_type;

-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-    const src_type* src_y = (const src_type*)(src + y * src_step);
-    T* dst0_y = (T*)(dst0 + y * dst0_step);
-    T* dst1_y = (T*)(dst1 + y * dst1_step);
+            const src_type* src_y = (const src_type*)(src + y * src_step);
+            T* dst0_y = (T*)(dst0 + y * dst0_step);
+            T* dst1_y = (T*)(dst1 + y * dst1_step);

-    if (x < cols && y < rows) 
-    {
-        src_type src_elem = src_y[x];
-        dst0_y[x] = src_elem.x;
-        dst1_y[x] = src_elem.y;
-    }
-}
+            if (x < cols && y < rows) 
+            {
+                src_type src_elem = src_y[x];
+                dst0_y[x] = src_elem.x;
+                dst1_y[x] = src_elem.y;
+            }
+        }


-template <typename T>
-__global__ void splitC3_(const uchar* src, size_t src_step, 
-                        int rows, int cols,
-                        uchar* dst0, size_t dst0_step,
-                        uchar* dst1, size_t dst1_step,
-                        uchar* dst2, size_t dst2_step)
-{
-    typedef typename TypeTraits<T>::type3 src_type;
+        template <typename T>
+        __global__ void splitC3_(const uchar* src, size_t src_step, 
+                                int rows, int cols,
+                                uchar* dst0, size_t dst0_step,
+                                uchar* dst1, size_t dst1_step,
+                                uchar* dst2, size_t dst2_step)
+        {
+            typedef typename TypeTraits<T>::type3 src_type;

-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-    const src_type* src_y = (const src_type*)(src + y * src_step);
-    T* dst0_y = (T*)(dst0 + y * dst0_step);
-    T* dst1_y = (T*)(dst1 + y * dst1_step);
-    T* dst2_y = (T*)(dst2 + y * dst2_step);
+            const src_type* src_y = (const src_type*)(src + y * src_step);
+            T* dst0_y = (T*)(dst0 + y * dst0_step);
+            T* dst1_y = (T*)(dst1 + y * dst1_step);
+            T* dst2_y = (T*)(dst2 + y * dst2_step);

-    if (x < cols && y < rows) 
-    {
-        src_type src_elem = src_y[x];
-        dst0_y[x] = src_elem.x;
-        dst1_y[x] = src_elem.y;
-        dst2_y[x] = src_elem.z;
-    }
-}
+            if (x < cols && y < rows) 
+            {
+                src_type src_elem = src_y[x];
+                dst0_y[x] = src_elem.x;
+                dst1_y[x] = src_elem.y;
+                dst2_y[x] = src_elem.z;
+            }
+        }


-template <>
-__global__ void splitC3_<double>(
-        const uchar* src, size_t src_step, int rows, int cols,
-        uchar* dst0, size_t dst0_step,
-        uchar* dst1, size_t dst1_step,
-        uchar* dst2, size_t dst2_step)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        template <>
+        __global__ void splitC3_<double>(
+                const uchar* src, size_t src_step, int rows, int cols,
+                uchar* dst0, size_t dst0_step,
+                uchar* dst1, size_t dst1_step,
+                uchar* dst2, size_t dst2_step)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-    const double* src_y = (const double*)(src + y * src_step);
-    double* dst0_y = (double*)(dst0 + y * dst0_step);
-    double* dst1_y = (double*)(dst1 + y * dst1_step);
-    double* dst2_y = (double*)(dst2 + y * dst2_step);
+            const double* src_y = (const double*)(src + y * src_step);
+            double* dst0_y = (double*)(dst0 + y * dst0_step);
+            double* dst1_y = (double*)(dst1 + y * dst1_step);
+            double* dst2_y = (double*)(dst2 + y * dst2_step);

-    if (x < cols && y < rows) 
-    {
-        dst0_y[x] = src_y[3 * x];
-        dst1_y[x] = src_y[3 * x + 1];
-        dst2_y[x] = src_y[3 * x + 2];
-    }
-}
+            if (x < cols && y < rows) 
+            {
+                dst0_y[x] = src_y[3 * x];
+                dst1_y[x] = src_y[3 * x + 1];
+                dst2_y[x] = src_y[3 * x + 2];
+            }
+        }


-template <typename T>
-__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
-                        uchar* dst0, size_t dst0_step,
-                        uchar* dst1, size_t dst1_step,
-                        uchar* dst2, size_t dst2_step,
-                        uchar* dst3, size_t dst3_step)
-{
-    typedef typename TypeTraits<T>::type4 src_type;
+        template <typename T>
+        __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
+                                uchar* dst0, size_t dst0_step,
+                                uchar* dst1, size_t dst1_step,
+                                uchar* dst2, size_t dst2_step,
+                                uchar* dst3, size_t dst3_step)
+        {
+            typedef typename TypeTraits<T>::type4 src_type;

-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-    const src_type* src_y = (const src_type*)(src + y * src_step);
-    T* dst0_y = (T*)(dst0 + y * dst0_step);
-    T* dst1_y = (T*)(dst1 + y * dst1_step);
-    T* dst2_y = (T*)(dst2 + y * dst2_step);
-    T* dst3_y = (T*)(dst3 + y * dst3_step);
+            const src_type* src_y = (const src_type*)(src + y * src_step);
+            T* dst0_y = (T*)(dst0 + y * dst0_step);
+            T* dst1_y = (T*)(dst1 + y * dst1_step);
+            T* dst2_y = (T*)(dst2 + y * dst2_step);
+            T* dst3_y = (T*)(dst3 + y * dst3_step);

-    if (x < cols && y < rows) 
-    {
-        src_type src_elem = src_y[x];
-        dst0_y[x] = src_elem.x;
-        dst1_y[x] = src_elem.y;
-        dst2_y[x] = src_elem.z;
-        dst3_y[x] = src_elem.w;
-    }
-}
+            if (x < cols && y < rows) 
+            {
+                src_type src_elem = src_y[x];
+                dst0_y[x] = src_elem.x;
+                dst1_y[x] = src_elem.y;
+                dst2_y[x] = src_elem.z;
+                dst3_y[x] = src_elem.w;
+            }
+        }


-template <>
-__global__ void splitC4_<double>(
-        const uchar* src, size_t src_step, int rows, int cols,
-        uchar* dst0, size_t dst0_step,
-        uchar* dst1, size_t dst1_step,
-        uchar* dst2, size_t dst2_step,
-        uchar* dst3, size_t dst3_step)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+        template <>
+        __global__ void splitC4_<double>(
+                const uchar* src, size_t src_step, int rows, int cols,
+                uchar* dst0, size_t dst0_step,
+                uchar* dst1, size_t dst1_step,
+                uchar* dst2, size_t dst2_step,
+                uchar* dst3, size_t dst3_step)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-    const double2* src_y = (const double2*)(src + y * src_step);
-    double* dst0_y = (double*)(dst0 + y * dst0_step);
-    double* dst1_y = (double*)(dst1 + y * dst1_step);
-    double* dst2_y = (double*)(dst2 + y * dst2_step);
-    double* dst3_y = (double*)(dst3 + y * dst3_step);
+            const double2* src_y = (const double2*)(src + y * src_step);
+            double* dst0_y = (double*)(dst0 + y * dst0_step);
+            double* dst1_y = (double*)(dst1 + y * dst1_step);
+            double* dst2_y = (double*)(dst2 + y * dst2_step);
+            double* dst3_y = (double*)(dst3 + y * dst3_step);

-    if (x < cols && y < rows) 
-    {
-        double2 src_elem1 = src_y[2 * x];
-        double2 src_elem2 = src_y[2 * x + 1];
-        dst0_y[x] = src_elem1.x;
-        dst1_y[x] = src_elem1.y;
-        dst2_y[x] = src_elem2.x;
-        dst3_y[x] = src_elem2.y;
-    }
-}
+            if (x < cols && y < rows) 
+            {
+                double2 src_elem1 = src_y[2 * x];
+                double2 src_elem2 = src_y[2 * x + 1];
+                dst0_y[x] = src_elem1.x;
+                dst1_y[x] = src_elem1.y;
+                dst2_y[x] = src_elem2.x;
+                dst3_y[x] = src_elem2.y;
+            }
+        }

-template <typename T>
-static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
-{
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.data, src.step, src.rows, src.cols,
-            dst[0].data, dst[0].step,
-            dst[1].data, dst[1].step);
-    cudaSafeCall( cudaGetLastError() );
+        template <typename T>
+        static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+        {
+            dim3 blockDim(32, 8);
+            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+            splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+                    src.data, src.step, src.rows, src.cols,
+                    dst[0].data, dst[0].step,
+                    dst[1].data, dst[1].step);
+            cudaSafeCall( cudaGetLastError() );

-    if (stream == 0)
-        cudaSafeCall(cudaDeviceSynchronize());
-}
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }


-template <typename T>
-static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
-{
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
-            src.data, src.step, src.rows, src.cols,
-            dst[0].data, dst[0].step,
-            dst[1].data, dst[1].step,
-            dst[2].data, dst[2].step);
-    cudaSafeCall( cudaGetLastError() );
+        template <typename T>
+        static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+        {
+            dim3 blockDim(32, 8);
+            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+            splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+                    src.data, src.step, src.rows, src.cols,
+                    dst[0].data, dst[0].step,
+                    dst[1].data, dst[1].step,
+                    dst[2].data, dst[2].step);
+            cudaSafeCall( cudaGetLastError() );

-    if (stream == 0)
-        cudaSafeCall(cudaDeviceSynchronize());
-}
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }


-template <typename T>
-static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
-{
-    dim3 blockDim(32, 8);
-    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-    splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
-             src.data, src.step, src.rows, src.cols,
-             dst[0].data, dst[0].step,
-             dst[1].data, dst[1].step,
-             dst[2].data, dst[2].step,
-             dst[3].data, dst[3].step);
-    cudaSafeCall( cudaGetLastError() );
+        template <typename T>
+        static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+        {
+            dim3 blockDim(32, 8);
+            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+            splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+                     src.data, src.step, src.rows, src.cols,
+                     dst[0].data, dst[0].step,
+                     dst[1].data, dst[1].step,
+                     dst[2].data, dst[2].step,
+                     dst[3].data, dst[3].step);
+            cudaSafeCall( cudaGetLastError() );

-    if (stream == 0)
-        cudaSafeCall(cudaDeviceSynchronize());
-}
+            if (stream == 0)
+                cudaSafeCall(cudaDeviceSynchronize());
+        }


-void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
-{
-    static SplitFunction split_func_tbl[] =
-    {
-        splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
-        splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
-        splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
-    };
+        void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
+        {
+            static SplitFunction split_func_tbl[] =
+            {
+                splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
+                splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
+                splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
+            };

-    size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
-    SplitFunction split_func = split_func_tbl[split_func_id];
+            size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
+            SplitFunction split_func = split_func_tbl[split_func_id];

-    if (split_func == 0)
-        cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
+            if (split_func == 0)
+                cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);

-    split_func(src, dst, stream);
-}
-
-} // namespace split_merge
-
-END_OPENCV_DEVICE_NAMESPACE
+            split_func(src, dst, stream);
+        }
+    } // namespace split_merge
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -42,496 +42,494 @@

 #include "internal_shared.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace stereobm {
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-#define ROWSperTHREAD 21     // the number of rows a thread will process
-
-#define BLOCK_W 128          // the thread block width (464)
-#define N_DISPARITIES 8
-
-#define STEREO_MIND 0                    // The minimum d range to check
-#define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing
-
-__constant__ unsigned int* cminSSDImage;
-__constant__ size_t cminSSD_step;
-__constant__ int cwidth;
-__constant__ int cheight;
-
-__device__ __forceinline__ int SQ(int a)
+namespace cv { namespace gpu { namespace device 
 {
-    return a * a;
-}
-
-template<int RADIUS>
-__device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
-{	
-    unsigned int cache = 0;
-    unsigned int cache2 = 0;
-
-    for(int i = 1; i <= RADIUS; i++)
-        cache += col_ssd[i];
-
-    col_ssd_cache[0] = cache;
-
-    __syncthreads();
-
-    if (threadIdx.x < BLOCK_W - RADIUS)
-        cache2 = col_ssd_cache[RADIUS];
-    else
-        for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
-            cache2 += col_ssd[i];
-
-    return col_ssd[0] + cache + cache2;
-}
-
-template<int RADIUS>
-__device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
-{
-    unsigned int ssd[N_DISPARITIES];
-
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-    ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
-    __syncthreads();
-    ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
-    __syncthreads();
-    ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
-    __syncthreads();
-    ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
-    __syncthreads();
-    ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
-    __syncthreads();
-    ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
-    __syncthreads();
-    ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
-    __syncthreads();
-    ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
-
-    int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
-
-    int bestIdx = 0;
-    for (int i = 0; i < N_DISPARITIES; i++)
+    namespace stereobm 
    {
-        if (mssd == ssd[i])
-            bestIdx = i;
-    }
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+        /////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
+        //////////////////////////////////////////////////////////////////////////////////////////////////

-    return make_uint2(mssd, bestIdx);
-}
+        #define ROWSperTHREAD 21     // the number of rows a thread will process

-template<int RADIUS>
-__device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
-{
-    unsigned char leftPixel1;
-    unsigned char leftPixel2;
-    unsigned char rightPixel1[8];
-    unsigned char rightPixel2[8];
-    unsigned int diff1, diff2;
+        #define BLOCK_W 128          // the thread block width (464)
+        #define N_DISPARITIES 8

-    leftPixel1 = imageL[idx1];
-    leftPixel2 = imageL[idx2];
+        #define STEREO_MIND 0                    // The minimum d range to check
+        #define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing

-    idx1 = idx1 - d;
-    idx2 = idx2 - d;
+        __constant__ unsigned int* cminSSDImage;
+        __constant__ size_t cminSSD_step;
+        __constant__ int cwidth;
+        __constant__ int cheight;

-    rightPixel1[7] = imageR[idx1 - 7];
-    rightPixel1[0] = imageR[idx1 - 0];
-    rightPixel1[1] = imageR[idx1 - 1];
-    rightPixel1[2] = imageR[idx1 - 2];
-    rightPixel1[3] = imageR[idx1 - 3];
-    rightPixel1[4] = imageR[idx1 - 4];
-    rightPixel1[5] = imageR[idx1 - 5];
-    rightPixel1[6] = imageR[idx1 - 6];
-
-    rightPixel2[7] = imageR[idx2 - 7];
-    rightPixel2[0] = imageR[idx2 - 0];
-    rightPixel2[1] = imageR[idx2 - 1];
-    rightPixel2[2] = imageR[idx2 - 2];
-    rightPixel2[3] = imageR[idx2 - 3];
-    rightPixel2[4] = imageR[idx2 - 4];
-    rightPixel2[5] = imageR[idx2 - 5];
-    rightPixel2[6] = imageR[idx2 - 6];
-
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-    diff1 = leftPixel1 - rightPixel1[0];
-    diff2 = leftPixel2 - rightPixel2[0];
-    col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[1];
-    diff2 = leftPixel2 - rightPixel2[1];
-    col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[2];
-    diff2 = leftPixel2 - rightPixel2[2];
-    col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[3];
-    diff2 = leftPixel2 - rightPixel2[3];
-    col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[4];
-    diff2 = leftPixel2 - rightPixel2[4];
-    col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[5];
-    diff2 = leftPixel2 - rightPixel2[5];
-    col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[6];
-    diff2 = leftPixel2 - rightPixel2[6];
-    col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-    diff1 = leftPixel1 - rightPixel1[7];
-    diff2 = leftPixel2 - rightPixel2[7];
-    col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-}
-
-template<int RADIUS>
-__device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
-{
-    unsigned char leftPixel1;
-    int idx;
-    unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};
-
-    for(int i = 0; i < (2 * RADIUS + 1); i++)
-    {
-        idx = y_tex * im_pitch + x_tex;
-        leftPixel1 = imageL[idx];
-        idx = idx - d;
-
-        diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);
-        diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
-        diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
-        diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
-        diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
-        diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
-        diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
-        diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
-
-        y_tex += 1;
-    }
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-    col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];
-    col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
-    col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
-    col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
-    col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
-    col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
-    col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
-    col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
-}
-
-template<int RADIUS>
-__global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
-{
-    extern __shared__ unsigned int col_ssd_cache[];
-    volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
-    volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  //#define N_DIRTY_PIXELS (2 * RADIUS)
-
-    //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
-    int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
-    //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
-    #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
-    //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
-
-    unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
-    unsigned char* disparImage = disp.data + X + Y * disp.step;
- /*   if (X < cwidth)
-    {
-        unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
-        for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
-            *ptr = 0xFFFFFFFF;
-    }*/
-    int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
-    int y_tex;
-    int x_tex = X - RADIUS;
-
-    if (x_tex >= cwidth)
-        return;
-
-    for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
-    {
-        y_tex = Y - RADIUS;
-
-        InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
-
-        if (col_ssd_extra > 0)
-            if (x_tex + BLOCK_W < cwidth)
-                InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
-
-        __syncthreads(); //before MinSSD function
-
-        if (X < cwidth - RADIUS && Y < cheight - RADIUS)
+        __device__ __forceinline__ int SQ(int a)
        {
-            uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-            if (minSSD.x < minSSDImage[0])
-            {
-                disparImage[0] = (unsigned char)(d + minSSD.y);
-                minSSDImage[0] = minSSD.x;
-            }
+            return a * a;
        }

-        for(int row = 1; row < end_row; row++)
-        {
-            int idx1 = y_tex * img_step + x_tex;
-            int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
+        template<int RADIUS>
+        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        {	
+            unsigned int cache = 0;
+            unsigned int cache2 = 0;
+
+            for(int i = 1; i <= RADIUS; i++)
+                cache += col_ssd[i];
+
+            col_ssd_cache[0] = cache;

            __syncthreads();

-            StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
+            if (threadIdx.x < BLOCK_W - RADIUS)
+                cache2 = col_ssd_cache[RADIUS];
+            else
+                for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
+                    cache2 += col_ssd[i];

-            if (col_ssd_extra)
-                if (x_tex + BLOCK_W < cwidth)
-                    StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
+            return col_ssd[0] + cache + cache2;
+        }

-            y_tex += 1;
+        template<int RADIUS>
+        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        {
+            unsigned int ssd[N_DISPARITIES];

-            __syncthreads(); //before MinSSD function
+            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
+            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
+            __syncthreads();
+            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));

-            if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
+            int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
+
+            int bestIdx = 0;
+            for (int i = 0; i < N_DISPARITIES; i++)
            {
-                int idx = row * cminSSD_step;
-                uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                if (minSSD.x < minSSDImage[idx])
+                if (mssd == ssd[i])
+                    bestIdx = i;
+            }
+
+            return make_uint2(mssd, bestIdx);
+        }
+
+        // Updates eight running column sums in col_ssd when the window moves by one step.
+        // For each k in 0..7 the squared difference between imageL[idx1] and
+        // imageR[idx1 - d - k] is subtracted, and the squared difference between
+        // imageL[idx2] and imageR[idx2 - d - k] is added. Consecutive sums are
+        // (BLOCK_W + 2 * RADIUS) entries apart in col_ssd.
+        template<int RADIUS>
+        __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
+        {
+            // Distance between the accumulators of neighbouring disparities
+            // (see above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)).
+            const int ssdStride = BLOCK_W + 2 * RADIUS;
+
+            const unsigned char subLeft = imageL[idx1];
+            const unsigned char addLeft = imageL[idx2];
+
+            const int subBase = idx1 - d;
+            const int addBase = idx2 - d;
+
+            unsigned char subRight[8];
+            unsigned char addRight[8];
+
+            // Fetch every right-image pixel first, then do the arithmetic.
+            #pragma unroll
+            for (int k = 0; k < 8; ++k)
+            {
+                subRight[k] = imageR[subBase - k];
+                addRight[k] = imageR[addBase - k];
+            }
+
+            #pragma unroll
+            for (int k = 0; k < 8; ++k)
+            {
+                const unsigned int subDiff = subLeft - subRight[k];
+                const unsigned int addDiff = addLeft - addRight[k];
+                col_ssd[k * ssdStride] += SQ(addDiff) - SQ(subDiff);
+            }
+        }
+
+        template<int RADIUS>  // RADIUS: the summed column spans 2 * RADIUS + 1 rows
+        __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)  // Computes from scratch the vertical sums of squared differences for column x_tex at disparities d .. d + 7.
+        {
+            unsigned char leftPixel1;
+            int idx;
+            unsigned int diffa[] = {0, 0, 0, 0, 0, 0, 0, 0};  // diffa[k] accumulates the sum for disparity d + k
+
+            for(int i = 0; i < (2 * RADIUS + 1); i++)  // rows y_tex .. y_tex + 2 * RADIUS
+            {
+                idx = y_tex * im_pitch + x_tex;  // linear offset of (x_tex, y_tex), with im_pitch as the row stride
+                leftPixel1 = imageL[idx];
+                idx = idx - d;  // right image is sampled d (+ k) pixels to the left of x_tex
+
+                diffa[0] += SQ(leftPixel1 - imageR[idx - 0]);  // SQ is defined outside this block; presumably it squares its argument
+                diffa[1] += SQ(leftPixel1 - imageR[idx - 1]);
+                diffa[2] += SQ(leftPixel1 - imageR[idx - 2]);
+                diffa[3] += SQ(leftPixel1 - imageR[idx - 3]);
+                diffa[4] += SQ(leftPixel1 - imageR[idx - 4]);
+                diffa[5] += SQ(leftPixel1 - imageR[idx - 5]);
+                diffa[6] += SQ(leftPixel1 - imageR[idx - 6]);
+                diffa[7] += SQ(leftPixel1 - imageR[idx - 7]);
+
+                y_tex += 1;  // next row of the column
+            }
+            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
+            col_ssd[0 * (BLOCK_W + 2 * RADIUS)] = diffa[0];  // the sum for disparity d + k is stored k * (BLOCK_W + 2 * RADIUS) entries after col_ssd
+            col_ssd[1 * (BLOCK_W + 2 * RADIUS)] = diffa[1];
+            col_ssd[2 * (BLOCK_W + 2 * RADIUS)] = diffa[2];
+            col_ssd[3 * (BLOCK_W + 2 * RADIUS)] = diffa[3];
+            col_ssd[4 * (BLOCK_W + 2 * RADIUS)] = diffa[4];
+            col_ssd[5 * (BLOCK_W + 2 * RADIUS)] = diffa[5];
+            col_ssd[6 * (BLOCK_W + 2 * RADIUS)] = diffa[6];
+            col_ssd[7 * (BLOCK_W + 2 * RADIUS)] = diffa[7];
+        }
+
+        template<int RADIUS>  // RADIUS: half-size of the matching window
+        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)  // Block matching: each thread owns column X and walks up to ROWSperTHREAD rows, keeping the lowest SSD per pixel.
+        {
+            extern __shared__ unsigned int col_ssd_cache[];  // dynamic shared memory; its size is supplied at launch
+            volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;  // this thread's column slot, past the first BLOCK_W entries
+            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  //#define N_DIRTY_PIXELS (2 * RADIUS)
+
+            //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
+            int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
+            //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
+            #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
+            //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
+
+            unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;  // best cost so far for pixel (X, Y)
+            unsigned char* disparImage = disp.data + X + Y * disp.step;  // output disparity for pixel (X, Y)
+         /*   if (X < cwidth)
+            {
+                unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
+                for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
+                    *ptr = 0xFFFFFFFF;
+            }*/
+            int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);  // rows this block may process without leaving the image
+            int y_tex;
+            int x_tex = X - RADIUS;  // leftmost column of the window centred on X
+
+            if (x_tex >= cwidth)
+                return;  // NOTE(review): threads leaving here skip the __syncthreads() calls below - confirm this is safe on all targets
+
+            for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)  // each pass evaluates a group of disparities starting at d
+            {
+                y_tex = Y - RADIUS;  // top row of the window centred on Y
+
+                InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);  // full column sums for the first row
+
+                if (col_ssd_extra)  // only threads with threadIdx.x < 2 * RADIUS own an extra column
+                    if (x_tex + BLOCK_W < cwidth)
+                        InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
+
+                __syncthreads(); //before MinSSD function
+
+                if (X < cwidth - RADIUS && Y < cheight - RADIUS)
                {
-                    disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
-                    minSSDImage[idx] = minSSD.x;
+                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
+                    if (minSSD.x < minSSDImage[0])  // keep the lowest cost seen across all disparity groups
+                    {
+                        disparImage[0] = (unsigned char)(d + minSSD.y);  // minSSD.y is added to d to form the stored disparity
+                        minSSDImage[0] = minSSD.x;
+                    }
+                }
+
+                for(int row = 1; row < end_row; row++)  // remaining rows: update the column sums incrementally
+                {
+                    int idx1 = y_tex * img_step + x_tex;  // row leaving the window
+                    int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;  // row entering the window
+
+                    __syncthreads();
+
+                    StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);  // adds the squared differences at idx2 and subtracts those at idx1
+
+                    if (col_ssd_extra)
+                        if (x_tex + BLOCK_W < cwidth)
+                            StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);  // same indices, images shifted by BLOCK_W columns
+
+                    y_tex += 1;
+
+                    __syncthreads(); //before MinSSD function
+
+                    if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
+                    {
+                        int idx = row * cminSSD_step;  // offset of this row inside the cost image
+                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
+                        if (minSSD.x < minSSDImage[idx])
+                        {
+                            disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
+                            minSSDImage[idx] = minSSD.x;
+                        }
+                    }
+                } // for row loop
+            } // for d loop
+        }
+
+
+        template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)  // Host-side launcher for stereoKernel<RADIUS>.
+        {
+            dim3 grid(1,1,1);
+            dim3 threads(BLOCK_W, 1, 1);  // one thread per column, BLOCK_W columns per block
+
+            grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);  // columns left after removing maxdisp and a RADIUS border on each side
+            grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);  // each block covers ROWSperTHREAD rows
+
+            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
+            size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);  // BLOCK_W leading entries plus N_DISPARITIES rows of column sums
+
+            stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
+            cudaSafeCall( cudaGetLastError() );  // reports launch-configuration errors
+
+            if (stream == 0)  // default stream only: wait for completion so execution errors are reported here
+                cudaSafeCall( cudaDeviceSynchronize() );
+        };
+
+        typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);
+
+        const static kernel_caller_t callers[] =  // launcher table indexed by window radius (winsz >> 1); entry N must be kernel_caller<N>
+        {
+            0,  // radius 0 has no instantiation
+            kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
+            kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
+            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
+            kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
+            kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
+
+            //0,0,0, 0,0,0, 0,0,kernel_caller<9>
+        };
+        const int calles_num = sizeof(callers)/sizeof(callers[0]);  // number of table entries, used to range-check the radius
+
+        void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)  // Host entry point: validates winsz, resets the outputs, uploads constants and dispatches by radius.
+        {
+            int winsz2 = winsz >> 1;  // window radius; also the index into callers[]
+
+            if (winsz2 == 0 || winsz2 >= calles_num)  // slot 0 is a null entry and anything past the table has no instantiation
+                cv::gpu::error("Unsupported window size", __FILE__, __LINE__);  // NOTE(review): relies on cv::gpu::error not returning - otherwise callers[winsz2] below is still reached; confirm
+
+            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
+            //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
+
+            cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );  // clear the disparity map
+            cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );  // every byte 0xFF, so each unsigned int cost starts at 0xFFFFFFFF
+
+            cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );  // NOTE(review): these memsets and symbol copies are not issued on `stream` - confirm ordering when stream != 0
+            cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
+            cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );  // uploads the pointer value itself, not the buffer contents
+
+            size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();  // stride in elements (assuming step is a byte pitch - TODO confirm)
+            cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step,  &minssd_step, sizeof(minssd_step) ) );
+
+            callers[winsz2](left, right, disp, maxdisp, stream);  // launch the kernel instantiated for this radius
+        }
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+        /////////////////////////////////////// Sobel Prefilter //////////////////////////////////////////
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+
+        texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
+
+        __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)  // Writes the capped, offset horizontal Sobel response of texForSobel to output, one thread per pixel.
+        {
+            int x = blockDim.x * blockIdx.x + threadIdx.x;
+            int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (x < output.cols && y < output.rows)  // the grid is rounded up, so threads past the image edge do nothing
+            {
+                int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +  // 3x3 x-derivative: left column weighted negative, right column positive, middle row doubled
+                           (int)tex2D(texForSobel, x - 1, y    ) * (-2) + (int)tex2D(texForSobel, x + 1, y    ) * (2) +
+                           (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
+
+
+                conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);  // clamp to [-prefilterCap, prefilterCap], shift by +prefilterCap, cap at 255
+                output.ptr(y)[x] = conv & 0xFF;  // already within 0..255 when prefilterCap >= 0
+            }
+        }
+
+        void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)  // Host wrapper: binds input to texForSobel, runs prefilter_kernel over it, then unbinds.
+        {
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+            cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );  // the kernel reads the input only through this texture
+
+            dim3 threads(16, 16, 1);
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(input.cols, threads.x);  // enough 16x16 blocks to cover the whole image
+            grid.y = divUp(input.rows, threads.y);
+
+            prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
+            cudaSafeCall( cudaGetLastError() );  // reports launch-configuration errors
+
+            if (stream == 0)   // default stream only: wait for the kernel to finish
+                cudaSafeCall( cudaDeviceSynchronize() );    // execution errors are reported here
+
+            cudaSafeCall( cudaUnbindTexture (texForSobel ) );  // NOTE(review): with a non-default stream the kernel may still be running at this point - confirm unbinding is safe
+        }
+
+
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+        /////////////////////////////////// Textureness filtering ////////////////////////////////////////
+        //////////////////////////////////////////////////////////////////////////////////////////////////
+
+        texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
+
+        __device__ __forceinline__ float sobel(int x, int y)  // Absolute horizontal Sobel response at (x, y), sampled from texForTF.
+        {
+            float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +  // same 3x3 x-derivative weights as the prefilter, on normalized-float texture reads
+                         tex2D(texForTF, x - 1, y    ) * (-2) + tex2D(texForTF, x + 1, y    ) * (2) +
+                         tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
+            return fabs(conv);  // only the magnitude of the gradient is used
+        }
+
+        __device__ float CalcSums(float *cols, float *cols_cache, int winsz)  // Returns cols[0] plus the following entries of the window, reusing a neighbouring thread's partial sum where possible.
+        {
+            float cache = 0;
+            float cache2 = 0;
+            int winsz2 = winsz/2;
+
+            for(int i = 1; i <= winsz2; i++)  // first half: cols[1] .. cols[winsz2]
+                cache += cols[i];
+
+            cols_cache[0] = cache;  // publish this thread's first-half sum for other threads
+
+            __syncthreads();  // all partial sums must be written before any is read; every thread of the block has to reach this call
+
+            if (threadIdx.x < blockDim.x - winsz2)
+                cache2 = cols_cache[winsz2];  // second half equals the first-half sum published winsz2 slots further on
+            else
+                for(int i = winsz2 + 1; i < winsz; i++)  // no such slot was written by this block: sum the second half directly
+                    cache2 += cols[i];  // NOTE(review): matches the branch above only when winsz is odd - confirm callers never pass an even winsz
+
+            return cols[0] + cache + cache2;
+        }
+
+        #define RpT (2 * ROWSperTHREAD)  // got experimentally
+
+        __global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)  // Zeroes disparities whose summed |Sobel| response over the window falls below threshold.
+        {
+            int winsz2 = winsz/2;
+            int n_dirty_pixels = (winsz2) * 2;  // number of extra columns beyond the block's own blockDim.x
+
+            extern __shared__ float cols_cache[];  // dynamic shared memory; its size is supplied at launch
+            float *cols = cols_cache + blockDim.x + threadIdx.x;  // this thread's column sum, stored after the first blockDim.x entries
+            float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;  // the first n_dirty_pixels threads also fill one extra column
+
+            int x = blockIdx.x * blockDim.x + threadIdx.x;
+            int beg_row = blockIdx.y * RpT;  // each block handles RpT consecutive rows
+            int end_row = ::min(beg_row + RpT, disp.rows);
+
+            if (x < disp.cols)  // NOTE(review): the __syncthreads() calls inside this branch and in CalcSums are skipped by threads with x >= disp.cols - confirm partially filled blocks cannot hang
+            {
+                int y = beg_row;
+
+                float sum = 0;
+                float sum_extra = 0;
+
+                for(int i = y - winsz2; i <= y + winsz2; ++i)  // initial vertical sum over 2 * winsz2 + 1 rows centred on y
+                {
+                    sum += sobel(x - winsz2, i);
+                    if (cols_extra)
+                        sum_extra += sobel(x + blockDim.x - winsz2, i);
+                }
+                *cols = sum;
+                if (cols_extra)
+                    *cols_extra = sum_extra;
+
+                __syncthreads();  // column sums must be visible to the whole block before CalcSums reads neighbours
+
+                float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;  // texture reads are normalized floats; scale the sum by 255 before comparing
+                if (sum_win < threshold)
+                    disp.data[y * disp.step + x] = 0;  // too little texture: discard the disparity
+
+                __syncthreads();  // finish reading shared memory before it is overwritten for the next row
+
+                for(int y = beg_row + 1; y < end_row; ++y)
+                {
+                    sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);  // slide the vertical window down one row
+                    *cols = sum;
+
+                    if (cols_extra)
+                    {
+                        sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
+                        *cols_extra = sum_extra;
+                    }
+
+                    __syncthreads();
+                    float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
+                    if (sum_win < threshold)
+                        disp.data[y * disp.step + x] = 0;
+
+                    __syncthreads();
                }
            }
-        } // for row loop
-    } // for d loop
-}
-
-
-template<int RADIUS> void kernel_caller(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream)
-{
-    dim3 grid(1,1,1);
-    dim3 threads(BLOCK_W, 1, 1);
-
-    grid.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
-    grid.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
-
-    //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-    size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
-
-    stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-};
-
-typedef void (*kernel_caller_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, cudaStream_t & stream);
-
-const static kernel_caller_t callers[] =
-{
-    0,
-    kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
-    kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
-    kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
-    kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
-    kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
-
-    //0,0,0, 0,0,0, 0,0,kernel_caller<9>
-};
-const int calles_num = sizeof(callers)/sizeof(callers[0]);
-
-void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
-{
-    int winsz2 = winsz >> 1;
-
-    if (winsz2 == 0 || winsz2 >= calles_num)
-        cv::gpu::error("Unsupported window size", __FILE__, __LINE__);
-
-    //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
-    //cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
-
-    cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
-    cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
-
-    cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
-    cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
-    cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
-
-    size_t minssd_step = minSSD_buf.step/minSSD_buf.elemSize();
-    cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step,  &minssd_step, sizeof(minssd_step) ) );
-
-    callers[winsz2](left, right, disp, maxdisp, stream);
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////////// Sobel Prefiler ///////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
-
-__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
-{
-    int x = blockDim.x * blockIdx.x + threadIdx.x;
-    int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-    if (x < output.cols && y < output.rows)
-    {
-        int conv = (int)tex2D(texForSobel, x - 1, y - 1) * (-1) + (int)tex2D(texForSobel, x + 1, y - 1) * (1) +
-                   (int)tex2D(texForSobel, x - 1, y    ) * (-2) + (int)tex2D(texForSobel, x + 1, y    ) * (2) +
-                   (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);
-
-
-        conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
-        output.ptr(y)[x] = conv & 0xFF;
-    }
-}
-
-void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
-{
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-    cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
-
-    dim3 threads(16, 16, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(input.cols, threads.x);
-    grid.y = divUp(input.rows, threads.y);
-
-    prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)   
-        cudaSafeCall( cudaDeviceSynchronize() );    
-
-    cudaSafeCall( cudaUnbindTexture (texForSobel ) );
-}
-
-
-//////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////// Textureness filtering ////////////////////////////////////////
-//////////////////////////////////////////////////////////////////////////////////////////////////
-
-texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
-
-__device__ __forceinline__ float sobel(int x, int y)
-{
-    float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
-                 tex2D(texForTF, x - 1, y    ) * (-2) + tex2D(texForTF, x + 1, y    ) * (2) +
-                 tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
-    return fabs(conv);
-}
-
-__device__ float CalcSums(float *cols, float *cols_cache, int winsz)
-{
-    float cache = 0;
-    float cache2 = 0;
-    int winsz2 = winsz/2;
-
-    for(int i = 1; i <= winsz2; i++)
-        cache += cols[i];
-
-    cols_cache[0] = cache;
-
-    __syncthreads();
-
-    if (threadIdx.x < blockDim.x - winsz2)
-        cache2 = cols_cache[winsz2];
-    else
-        for(int i = winsz2 + 1; i < winsz; i++)
-            cache2 += cols[i];
-
-    return cols[0] + cache + cache2;
-}
-
-#define RpT (2 * ROWSperTHREAD)  // got experimentally
-
-__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
-{
-    int winsz2 = winsz/2;
-    int n_dirty_pixels = (winsz2) * 2;
-
-    extern __shared__ float cols_cache[];
-    float *cols = cols_cache + blockDim.x + threadIdx.x;
-    float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
-
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int beg_row = blockIdx.y * RpT;
-    int end_row = ::min(beg_row + RpT, disp.rows);
-
-    if (x < disp.cols)
-    {
-        int y = beg_row;
-
-        float sum = 0;
-        float sum_extra = 0;
-
-        for(int i = y - winsz2; i <= y + winsz2; ++i)
-        {
-            sum += sobel(x - winsz2, i);
-            if (cols_extra)
-                sum_extra += sobel(x + blockDim.x - winsz2, i);
        }
-        *cols = sum;
-        if (cols_extra)
-            *cols_extra = sum_extra;

-        __syncthreads();
-
-        float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
-        if (sum_win < threshold)
-            disp.data[y * disp.step + x] = 0;
-
-        __syncthreads();
-
-        for(int y = beg_row + 1; y < end_row; ++y)
+        void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)  // Host wrapper: binds input to texForTF and runs textureness_kernel on disp.
        {
-            sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
-            *cols = sum;
+            avgTexturenessThreshold *= winsz * winsz;  // turn the per-pixel average threshold into a threshold on the window sum

-            if (cols_extra)
-            {
-                sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
-                *cols_extra = sum_extra;
-            }
+            texForTF.filterMode     = cudaFilterModeLinear;
+            texForTF.addressMode[0] = cudaAddressModeWrap;  // NOTE(review): wrap addressing normally needs normalized coordinates - confirm the mode actually applied to out-of-range reads
+            texForTF.addressMode[1] = cudaAddressModeWrap;

-            __syncthreads();
-            float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
-            if (sum_win < threshold)
-                disp.data[y * disp.step + x] = 0;
+            cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
+            cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );  // the kernel reads the input only through this texture

-            __syncthreads();
+            dim3 threads(128, 1, 1);  // one thread per column, 128 columns per block
+            dim3 grid(1, 1, 1);
+
+            grid.x = divUp(input.cols, threads.x);
+            grid.y = divUp(input.rows, RpT);  // each block covers RpT rows
+
+            size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);  // partial-sum cache + one column sum per thread + 2 * (winsz / 2) extra columns
+            textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
+            cudaSafeCall( cudaGetLastError() );  // reports launch-configuration errors
+
+            if (stream == 0)  // default stream only: wait for the kernel to finish
+                cudaSafeCall( cudaDeviceSynchronize() );
+
+            cudaSafeCall( cudaUnbindTexture (texForTF) );  // NOTE(review): with a non-default stream the kernel may still be running at this point - confirm unbinding is safe
        }
-    }
-}
-
-void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
-{
-    avgTexturenessThreshold *= winsz * winsz;
-
-    texForTF.filterMode     = cudaFilterModeLinear;
-    texForTF.addressMode[0] = cudaAddressModeWrap;
-    texForTF.addressMode[1] = cudaAddressModeWrap;
-
-    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-    cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
-
-    dim3 threads(128, 1, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(input.cols, threads.x);
-    grid.y = divUp(input.rows, RpT);
-
-    size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
-    textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-
-    cudaSafeCall( cudaUnbindTexture (texForTF) );
-}
-
-} // namespace stereobm
-
-END_OPENCV_DEVICE_NAMESPACE
+    } // namespace stereobm
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -44,489 +44,487 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/limits.hpp"

-BEGIN_OPENCV_DEVICE_NAMESPACE
-
-namespace stereobp {
-
-///////////////////////////////////////////////////////////////
-/////////////////////// load constants ////////////////////////
-///////////////////////////////////////////////////////////////
-
-__constant__ int   cndisp;
-__constant__ float cmax_data_term;
-__constant__ float cdata_weight;
-__constant__ float cmax_disc_term;
-__constant__ float cdisc_single_jump;
-
-void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
+namespace cv { namespace gpu { namespace device 
 {
-    cudaSafeCall( cudaMemcpyToSymbol(cndisp,            &ndisp,            sizeof(int  )) );
-    cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
-    cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
-    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
-    cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
-}
-
-///////////////////////////////////////////////////////////////
-////////////////////////// comp data //////////////////////////
-///////////////////////////////////////////////////////////////
-
-template <int cn> struct PixDiff;
-template <> struct PixDiff<1>
-{
-    __device__ __forceinline__ PixDiff(const uchar* ls)
+    namespace stereobp 
    {
-        l = *ls;
-    }
-    __device__ __forceinline__ float operator()(const uchar* rs) const
-    {
-        return ::abs((int)l - *rs);
-    }
-    uchar l;
-};
-template <> struct PixDiff<3>
-{
-    __device__ __forceinline__ PixDiff(const uchar* ls)
-    {
-        l = *((uchar3*)ls);
-    }
-    __device__ __forceinline__ float operator()(const uchar* rs) const
-    {
-        const float tr = 0.299f;
-        const float tg = 0.587f;
-        const float tb = 0.114f;
+        ///////////////////////////////////////////////////////////////
+        /////////////////////// load constants ////////////////////////
+        ///////////////////////////////////////////////////////////////

-        float val  = tb * ::abs((int)l.x - rs[0]);
-              val += tg * ::abs((int)l.y - rs[1]);
-              val += tr * ::abs((int)l.z - rs[2]);
+        __constant__ int   cndisp;
+        __constant__ float cmax_data_term;
+        __constant__ float cdata_weight;
+        __constant__ float cmax_disc_term;
+        __constant__ float cdisc_single_jump;

-        return val;
-    }
-    uchar3 l;
-};
-template <> struct PixDiff<4>
-{
-    __device__ __forceinline__ PixDiff(const uchar* ls)
-    {
-        l = *((uchar4*)ls);
-    }
-    __device__ __forceinline__ float operator()(const uchar* rs) const
-    {
-        const float tr = 0.299f;
-        const float tg = 0.587f;
-        const float tb = 0.114f;
-
-        uchar4 r = *((uchar4*)rs);
-
-        float val  = tb * ::abs((int)l.x - r.x);
-              val += tg * ::abs((int)l.y - r.y);
-              val += tr * ::abs((int)l.z - r.z);
-
-        return val;
-    }
-    uchar4 l;
-};
-
-template <int cn, typename D>
-__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
-    {
-        const uchar* ls = left.ptr(y) + x * cn;
-        const PixDiff<cn> pixDiff(ls);
-        const uchar* rs = right.ptr(y) + x * cn;
-
-        D* ds = data.ptr(y) + x;
-        const size_t disp_step = data.step * left.rows;
-
-        for (int disp = 0; disp < cndisp; disp++)
+        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)  // Copies the stereobp parameters into their __constant__ counterparts on the device.
        {
-            if (x - disp >= 1)
-            {
-                float val = pixDiff(rs - disp * cn);
+            cudaSafeCall( cudaMemcpyToSymbol(cndisp,            &ndisp,            sizeof(int  )) );  // each host argument is copied into the __constant__ symbol of the matching name
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
+            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
+        }

-                ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
+        ///////////////////////////////////////////////////////////////
+        ////////////////////////// comp data //////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        template <int cn> struct PixDiff;
+        template <> struct PixDiff<1>
+        {
+            __device__ __forceinline__ PixDiff(const uchar* ls)
+            {
+                l = *ls;
            }
-            else
+            __device__ __forceinline__ float operator()(const uchar* rs) const
            {
-                ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
+                return ::abs((int)l - *rs);
            }
-        }
-    }
-}
-
-template<typename T, typename D>
-void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
-
-template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(left.cols, threads.x);
-    grid.y = divUp(left.rows, threads.y);
-
-    comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(left.cols, threads.x);
-    grid.y = divUp(left.rows, threads.y);
-
-    comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(left.cols, threads.x);
-    grid.y = divUp(left.rows, threads.y);
-
-    comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(left.cols, threads.x);
-    grid.y = divUp(left.rows, threads.y);
-
-    comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(left.cols, threads.x);
-    grid.y = divUp(left.rows, threads.y);
-
-    comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(left.cols, threads.x);
-    grid.y = divUp(left.rows, threads.y);
-
-    comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-///////////////////////////////////////////////////////////////
-//////////////////////// data step down ///////////////////////
-///////////////////////////////////////////////////////////////
-
-template <typename T>
-__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        for (int d = 0; d < cndisp; ++d)
+            uchar l;
+        };
+        template <> struct PixDiff<3>
        {
-            float dst_reg  = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
-                  dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
-                  dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
-                  dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
-
-            dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
-        }
-    }
-}
-
-template<typename T>
-void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(dst_cols, threads.x);
-    grid.y = divUp(dst_rows, threads.y);
-
-    data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
-template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
-
-///////////////////////////////////////////////////////////////
-/////////////////// level up messages  ////////////////////////
-///////////////////////////////////////////////////////////////
-
-template <typename T>
-__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (x < dst_cols && y < dst_rows)
-    {
-        const size_t dst_disp_step = dst.step * dst_rows;
-        const size_t src_disp_step = src.step * src_rows;
-
-        T*       dstr = dst.ptr(y  ) + x;
-        const T* srcr = src.ptr(y/2) + x/2;
-
-        for (int d = 0; d < cndisp; ++d)
-            dstr[d * dst_disp_step] = srcr[d * src_disp_step];
-    }
-}
-
-template <typename T>
-void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(dst_cols, threads.x);
-    grid.y = divUp(dst_rows, threads.y);
-
-    int src_idx = (dst_idx + 1) & 1;
-
-    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
-    cudaSafeCall( cudaGetLastError() );
-
-    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
-    cudaSafeCall( cudaGetLastError() );
-
-    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
-    cudaSafeCall( cudaGetLastError() );
-
-    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
-    cudaSafeCall( cudaGetLastError() );
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
-template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
-
-///////////////////////////////////////////////////////////////
-////////////////////  calc all iterations /////////////////////
-///////////////////////////////////////////////////////////////
-
-template <typename T>
-__device__ void calc_min_linear_penalty(T* dst, size_t step)
-{
-    float prev = dst[0];
-    float cur;
-    for (int disp = 1; disp < cndisp; ++disp)
-    {
-        prev += cdisc_single_jump;
-        cur = dst[step * disp];
-        if (prev < cur)
-        {
-            cur = prev;
-            dst[step * disp] = saturate_cast<T>(prev);
-        }
-        prev = cur;
-    }
-
-    prev = dst[(cndisp - 1) * step];
-    for (int disp = cndisp - 2; disp >= 0; disp--)
-    {
-        prev += cdisc_single_jump;
-        cur = dst[step * disp];
-        if (prev < cur)
-        {
-            cur = prev;
-            dst[step * disp] = saturate_cast<T>(prev);
-        }
-        prev = cur;
-    }
-}
-
-template <typename T>
-__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
-{
-    float minimum = device::numeric_limits<float>::max();
-
-    for(int i = 0; i < cndisp; ++i)
-    {
-        float dst_reg  = msg1[msg_disp_step * i];
-              dst_reg += msg2[msg_disp_step * i];
-              dst_reg += msg3[msg_disp_step * i];
-              dst_reg += data[data_disp_step * i];
-
-        if (dst_reg < minimum)
-            minimum = dst_reg;
-
-        dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
-    }
-
-    calc_min_linear_penalty(dst, msg_disp_step);
-
-    minimum += cmax_disc_term;
-
-    float sum = 0;
-    for(int i = 0; i < cndisp; ++i)
-    {
-        float dst_reg = dst[msg_disp_step * i];
-        if (dst_reg > minimum)
-        {
-            dst_reg = minimum;
-            dst[msg_disp_step * i] = saturate_cast<T>(minimum);
-        }
-        sum += dst_reg;
-    }
-    sum /= cndisp;
-
-    for(int i = 0; i < cndisp; ++i)
-        dst[msg_disp_step * i] -= sum;
-}
-
-template <typename T>
-__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
-{
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-    const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
-
-    if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
-    {
-        T* us = u.ptr(y) + x;
-        T* ds = d + y * u.step + x;
-        T* ls = l + y * u.step + x;
-        T* rs = r + y * u.step + x;
-        const T* dt = data.ptr(y) + x;
-
-        size_t msg_disp_step = u.step * rows;
-        size_t data_disp_step = data.step * rows;
-
-        message(us + u.step, ls      + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
-        message(ds - u.step, ls      + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
-        message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
-        message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
-    }
-}
-
-template <typename T>
-void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
-    const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
-
-    grid.x = divUp(cols, threads.x << 1);
-    grid.y = divUp(rows, threads.y);
-
-    for(int t = 0; t < iters; ++t)
-    {
-        one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
-template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
-
-///////////////////////////////////////////////////////////////
-/////////////////////////// output ////////////////////////////
-///////////////////////////////////////////////////////////////
-
-template <typename T>
-__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
-    DevMem2D_<short> disp)
-{
-    const int x = blockIdx.x * blockDim.x + threadIdx.x;
-    const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
-    {
-        const T* us = u.ptr(y + 1) + x;
-        const T* ds = d + (y - 1) * u.step + x;
-        const T* ls = l + y * u.step + (x + 1);
-        const T* rs = r + y * u.step + (x - 1);
-        const T* dt = data + y * u.step + x;
-
-        size_t disp_step = disp.rows * u.step;
-
-        int best = 0;
-        float best_val = numeric_limits<float>::max();
-        for (int d = 0; d < cndisp; ++d)
-        {
-            float val  = us[d * disp_step];
-                  val += ds[d * disp_step];
-                  val += ls[d * disp_step];
-                  val += rs[d * disp_step];
-                  val += dt[d * disp_step];
-
-            if (val < best_val)
+            __device__ __forceinline__ PixDiff(const uchar* ls)
            {
-                best_val = val;
-                best = d;
+                l = *((uchar3*)ls);
+            }
+            __device__ __forceinline__ float operator()(const uchar* rs) const
+            {
+                const float tr = 0.299f;
+                const float tg = 0.587f;
+                const float tb = 0.114f;
+
+                float val  = tb * ::abs((int)l.x - rs[0]);
+                      val += tg * ::abs((int)l.y - rs[1]);
+                      val += tr * ::abs((int)l.z - rs[2]);
+
+                return val;
+            }
+            uchar3 l;
+        };
+        // PixDiff specialisation for pixels stored as four consecutive bytes. The left pixel is
+        // cached at construction; operator() returns a weighted sum of the absolute differences
+        // of the .x, .y and .z components against a right pixel. The .w component is not used.
+        template <> struct PixDiff<4>
+        {
+            __device__ __forceinline__ PixDiff(const uchar* ls)
+            {
+                l = *((const uchar4*)ls);
+            }
+            __device__ __forceinline__ float operator()(const uchar* rs) const
+            {
+                // Weights applied to the .z, .y and .x components respectively.
+                const float wz = 0.299f;
+                const float wy = 0.587f;
+                const float wx = 0.114f;
+
+                const uchar4 rightPix = *((const uchar4*)rs);
+
+                float cost  = wx * ::abs((int)l.x - rightPix.x);
+                      cost += wy * ::abs((int)l.y - rightPix.y);
+                      cost += wz * ::abs((int)l.z - rightPix.z);
+
+                return cost;
+            }
+            uchar4 l;
+        };
+
+        // Data-cost kernel: one thread per pixel of the left image; threads on the one-pixel image
+        // border do nothing. For every disparity in [0, cndisp) the thread writes one cost value.
+        // Costs of consecutive disparities are stored data.step * left.rows elements apart.
+        // cndisp, cdata_weight and cmax_data_term are symbols defined outside this block.
+        template <int cn, typename D>
+        __global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
+            {
+                const PixDiff<cn> pixDiff(left.ptr(y) + x * cn);
+                const uchar* rightPix = right.ptr(y) + x * cn;
+
+                D* costs = data.ptr(y) + x;
+                const size_t planeStride = data.step * left.rows;
+
+                for (int disp = 0; disp < cndisp; disp++)
+                {
+                    if (x - disp < 1)
+                    {
+                        // The shifted right-image column would fall left of column 1: store the cap.
+                        costs[disp * planeStride] = saturate_cast<D>(cdata_weight * cmax_data_term);
+                    }
+                    else
+                    {
+                        const float diff = pixDiff(rightPix - disp * cn);
+
+                        // Weighted pixel difference, capped at the weighted maximum data term.
+                        costs[disp * planeStride] = saturate_cast<D>(fmin(cdata_weight * diff, cdata_weight * cmax_data_term));
+                    }
+                }
             }
         }

-        disp.ptr(y)[x] = saturate_cast<short>(best);
-    }
-}
+        template<typename T, typename D>
+        void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);

-template <typename T>
-void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
-    const DevMem2D_<short>& disp, cudaStream_t stream)
-{
-    dim3 threads(32, 8, 1);
-    dim3 grid(1, 1, 1);
+        template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        {
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);

-    grid.x = divUp(disp.cols, threads.x);
-    grid.y = divUp(disp.rows, threads.y);
+            grid.x = divUp(left.cols, threads.x);
+            grid.y = divUp(left.rows, threads.y);

-    output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
-    cudaSafeCall( cudaGetLastError() );
+            comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+            cudaSafeCall( cudaGetLastError() );

-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+        template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        {
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);

-template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
-template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
+            grid.x = divUp(left.cols, threads.x);
+            grid.y = divUp(left.rows, threads.y);

-} // namespace stereobp
+            comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+            cudaSafeCall( cudaGetLastError() );

-END_OPENCV_DEVICE_NAMESPACE
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Host-side launcher of comp_data<3, short>: 32x8 thread blocks covering left.cols x left.rows.
+        template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+            const dim3 grid(divUp(left.cols, block.x), divUp(left.rows, block.y), 1);
+
+            comp_data<3, short><<<grid, block, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+            cudaSafeCall( cudaGetLastError() );
+
+            // On stream 0 the call waits for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+        // Host-side launcher of comp_data<3, float>: 32x8 thread blocks covering left.cols x left.rows.
+        template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+            const dim3 grid(divUp(left.cols, block.x), divUp(left.rows, block.y), 1);
+
+            comp_data<3, float><<<grid, block, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+            cudaSafeCall( cudaGetLastError() );
+
+            // On stream 0 the call waits for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        // Host-side launcher of comp_data<4, short>: 32x8 thread blocks covering left.cols x left.rows.
+        template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+            const dim3 grid(divUp(left.cols, block.x), divUp(left.rows, block.y), 1);
+
+            comp_data<4, short><<<grid, block, 0, stream>>>(left, right, (DevMem2D_<short>)data);
+            cudaSafeCall( cudaGetLastError() );
+
+            // On stream 0 the call waits for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+        // Host-side launcher of comp_data<4, float>: 32x8 thread blocks covering left.cols x left.rows.
+        template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+            const dim3 grid(divUp(left.cols, block.x), divUp(left.rows, block.y), 1);
+
+            comp_data<4, float><<<grid, block, 0, stream>>>(left, right, (DevMem2D_<float>)data);
+            cudaSafeCall( cudaGetLastError() );
+
+            // On stream 0 the call waits for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        ///////////////////////////////////////////////////////////////
+        //////////////////////// data step down ///////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // One thread per destination element (x, y). For each disparity d in [0, cndisp) the four
+        // source values at rows 2y, 2y+1 and columns 2x, 2x+1 of plane d are summed in float and
+        // stored, saturated to T, in the destination. Plane d starts at row d * src_rows in src
+        // and at row d * dst_rows in dst.
+        template <typename T>
+        __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            // Threads outside the destination extent have nothing to do.
+            if (x >= dst_cols || y >= dst_rows)
+                return;
+
+            const int srcCol = 2 * x;
+            const int srcRow = 2 * y;
+
+            for (int d = 0; d < cndisp; ++d)
+            {
+                const T* upperRow = src.ptr(d * src_rows + (srcRow + 0));
+                const T* lowerRow = src.ptr(d * src_rows + (srcRow + 1));
+
+                float acc  = upperRow[srcCol + 0];
+                      acc += lowerRow[srcCol + 0];
+                      acc += upperRow[srcCol + 1];
+                      acc += lowerRow[srcCol + 1];
+
+                dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(acc);
+            }
+        }
+
+        // Host-side launcher of data_step_down<T>: 32x8 thread blocks covering dst_cols x dst_rows.
+        template<typename T>
+        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+            const dim3 grid(divUp(dst_cols, block.x), divUp(dst_rows, block.y), 1);
+
+            data_step_down<T><<<grid, block, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
+            cudaSafeCall( cudaGetLastError() );
+
+            // On stream 0 the call waits for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
+        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        /////////////////// level up messages  ////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // One thread per destination element (x, y): for every disparity in [0, cndisp) the value
+        // at source position (x/2, y/2) is copied to destination position (x, y). Consecutive
+        // disparities are step * rows elements apart in each buffer.
+        template <typename T>
+        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            // Threads outside the destination extent have nothing to do.
+            if (x >= dst_cols || y >= dst_rows)
+                return;
+
+            const size_t dstPlane = dst.step * dst_rows;
+            const size_t srcPlane = src.step * src_rows;
+
+            T*       out = dst.ptr(y) + x;
+            const T* in  = src.ptr(y/2) + x/2;
+
+            for (int d = 0; d < cndisp; ++d)
+                out[d * dstPlane] = in[d * srcPlane];
+        }
+
+        // Launches level_up_message<T> once for each of the four message buffer pairs, in the
+        // order mus, mds, mls, mrs. In every pair, element dst_idx is the destination and the
+        // other element (index (dst_idx + 1) & 1) is the source.
+        template <typename T>
+        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+            const dim3 grid(divUp(dst_cols, block.x), divUp(dst_rows, block.y), 1);
+
+            const int srcIdx = (dst_idx + 1) & 1;
+
+            DevMem2Db* const messageSets[4] = { mus, mds, mls, mrs };
+            for (int i = 0; i < 4; ++i)
+            {
+                level_up_message<T><<<grid, block, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)messageSets[i][srcIdx], (DevMem2D_<T>)messageSets[i][dst_idx]);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            // On stream 0 the call waits for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
+        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        ////////////////////  calc all iterations /////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // Two sweeps over the cndisp values dst[0], dst[step], dst[2 * step], ...: first upwards,
+        // then downwards. In each sweep a running value grows by cdisc_single_jump per element;
+        // an element larger than the running value is overwritten with it, otherwise the running
+        // value restarts from that element.
+        template <typename T>
+        __device__ void calc_min_linear_penalty(T* dst, size_t step)
+        {
+            // Upward sweep, starting from the first element.
+            float running = dst[0];
+            for (int disp = 1; disp < cndisp; ++disp)
+            {
+                running += cdisc_single_jump;
+                const float here = dst[step * disp];
+                if (running < here)
+                    dst[step * disp] = saturate_cast<T>(running);
+                else
+                    running = here;
+            }
+
+            // Downward sweep, starting from the last element.
+            running = dst[(cndisp - 1) * step];
+            for (int disp = cndisp - 2; disp >= 0; disp--)
+            {
+                running += cdisc_single_jump;
+                const float here = dst[step * disp];
+                if (running < here)
+                    dst[step * disp] = saturate_cast<T>(running);
+                else
+                    running = here;
+            }
+        }
+
+        // Computes one outgoing message into dst from three incoming messages and the data term:
+        //   1. dst[i] = msg1[i] + msg2[i] + msg3[i] + data[i] for each disparity, tracking the minimum;
+        //   2. calc_min_linear_penalty is applied to dst;
+        //   3. every entry is capped at minimum + cmax_disc_term;
+        //   4. the mean of the capped entries is subtracted from each entry.
+        // Messages are strided by msg_disp_step, the data term by data_disp_step.
+        template <typename T>
+        __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
+        {
+            float lowest = device::numeric_limits<float>::max();
+
+            for(int i = 0; i < cndisp; ++i)
+            {
+                float total  = msg1[msg_disp_step * i];
+                      total += msg2[msg_disp_step * i];
+                      total += msg3[msg_disp_step * i];
+                      total += data[data_disp_step * i];
+
+                if (total < lowest)
+                    lowest = total;
+
+                dst[msg_disp_step * i] = saturate_cast<T>(total);
+            }
+
+            calc_min_linear_penalty(dst, msg_disp_step);
+
+            // From here on `lowest` is the upper bound allowed for any entry.
+            lowest += cmax_disc_term;
+
+            float mean = 0;
+            for(int i = 0; i < cndisp; ++i)
+            {
+                float entry = dst[msg_disp_step * i];
+                if (entry > lowest)
+                {
+                    entry = lowest;
+                    dst[msg_disp_step * i] = saturate_cast<T>(lowest);
+                }
+                mean += entry;
+            }
+            mean /= cndisp;
+
+            for(int i = 0; i < cndisp; ++i)
+                dst[msg_disp_step * i] -= mean;
+        }
+
+        // One message-update pass. Each thread handles the pixel in row y whose column is
+        // 2 * (global x thread index) + ((y + t) & 1), so a single launch touches every second
+        // column of a row and the column parity alternates with both y and t. Pixels on the
+        // one-pixel border are skipped. The d, l and r buffers are addressed with u.step.
+        template <typename T>
+        __global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
+        {
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
+
+            const bool interior = (y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1);
+            if (!interior)
+                return;
+
+            const size_t offset = y * u.step + x;
+
+            T* us = u.ptr(y) + x;
+            T* ds = d + offset;
+            T* ls = l + offset;
+            T* rs = r + offset;
+            const T* dt = data.ptr(y) + x;
+
+            const size_t msg_disp_step = u.step * rows;
+            const size_t data_disp_step = data.step * rows;
+
+            // Each message is rebuilt from three neighbouring messages plus the data term.
+            message(us + u.step, ls      + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
+            message(ds - u.step, ls      + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
+            message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
+            message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
+        }
+
+        // Runs `iters` passes of one_iteration<T> over the message buffers u, d, l, r using the
+        // data term `data`. The pass index t is forwarded to the kernel, which uses it to select
+        // the column parity it updates. Launch errors are checked after every launch.
+        //
+        // When stream is 0 the function waits for the device once, after the last launch, so the
+        // results are complete on return. For any other stream the work is only enqueued.
+        template <typename T>
+        void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
+            const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
+        {
+            dim3 threads(32, 8, 1);
+            dim3 grid(1, 1, 1);
+
+            // Each thread of one_iteration covers two columns, hence the doubled block width.
+            grid.x = divUp(cols, threads.x << 1);
+            grid.y = divUp(rows, threads.y);
+
+            for(int t = 0; t < iters; ++t)
+            {
+                one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
+                cudaSafeCall( cudaGetLastError() );
+            }
+
+            // Kernels issued to one stream run in issue order, so pass t + 1 already sees the
+            // output of pass t. A host-side wait per pass would only stall the host between
+            // launches; a single wait after the last launch gives the same result.
+            if (stream == 0 && iters > 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
+        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
+
+        ///////////////////////////////////////////////////////////////
+        /////////////////////////// output ////////////////////////////
+        ///////////////////////////////////////////////////////////////
+
+        // One thread per pixel of `disp`; pixels on the one-pixel border are skipped. For every
+        // disparity in [0, cndisp) the thread adds the u message from the row below, the d message
+        // from the row above, the l message from the column to the right, the r message from the
+        // column to the left and the data term of the pixel itself, then stores the index of the
+        // smallest sum (the first one on ties). All buffers, including `data`, are addressed with
+        // u.step.
+        template <typename T>
+        __global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
+            DevMem2D_<short> disp)
+        {
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+            const bool interior = y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1;
+            if (!interior)
+                return;
+
+            const T* us = u.ptr(y + 1) + x;
+            const T* ds = d + (y - 1) * u.step + x;
+            const T* ls = l + y * u.step + (x + 1);
+            const T* rs = r + y * u.step + (x - 1);
+            const T* dt = data + y * u.step + x;
+
+            const size_t planeStride = disp.rows * u.step;
+
+            int bestIdx = 0;
+            float bestCost = numeric_limits<float>::max();
+            for (int k = 0; k < cndisp; ++k)
+            {
+                float cost  = us[k * planeStride];
+                      cost += ds[k * planeStride];
+                      cost += ls[k * planeStride];
+                      cost += rs[k * planeStride];
+                      cost += dt[k * planeStride];
+
+                if (cost < bestCost)
+                {
+                    bestCost = cost;
+                    bestIdx = k;
+                }
+            }
+
+            disp.ptr(y)[x] = saturate_cast<short>(bestIdx);
+        }
+
+        // Host-side launcher of output<T>: 32x8 thread blocks covering disp.cols x disp.rows.
+        // u is passed as a 2D view; d, l, r and data are passed as raw pointers.
+        template <typename T>
+        void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
+            const DevMem2D_<short>& disp, cudaStream_t stream)
+        {
+            const dim3 block(32, 8, 1);
+            const dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y), 1);
+
+            output<T><<<grid, block, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
+            cudaSafeCall( cudaGetLastError() );
+
+            // On stream 0 the call waits for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+
+        template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
+        template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
+    } // namespace stereobp
+}}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu