Merge pull request #1540 from jet47:gpuarithm-cudev

2013-10-21 16:34:44 +04:00
parent e290436a4c 8ed47c01b7
commit 21233656bd
64 changed files with 5250 additions and 8808 deletions
--- a/modules/cudev/include/opencv2/cudev.hpp
+++ b/modules/cudev/include/opencv2/cudev.hpp
@@ -73,7 +73,7 @@
 #include "cudev/block/vec_distance.hpp"

 #include "cudev/grid/copy.hpp"
-#include "cudev/grid/glob_reduce.hpp"
+#include "cudev/grid/reduce.hpp"
 #include "cudev/grid/histogram.hpp"
 #include "cudev/grid/integral.hpp"
 #include "cudev/grid/pyramids.hpp"
--- a/modules/cudev/include/opencv2/cudev/expr/reduction.hpp
+++ b/modules/cudev/include/opencv2/cudev/expr/reduction.hpp
@@ -47,7 +47,7 @@
 #define __OPENCV_CUDEV_EXPR_REDUCTION_HPP__

 #include "../common.hpp"
-#include "../grid/glob_reduce.hpp"
+#include "../grid/reduce.hpp"
 #include "../grid/histogram.hpp"
 #include "../grid/integral.hpp"
 #include "../grid/reduce_to_vec.hpp"
--- a/modules/cudev/include/opencv2/cudev/functional/functional.hpp
+++ b/modules/cudev/include/opencv2/cudev/functional/functional.hpp
@@ -616,6 +616,30 @@ template <typename T> struct magnitude_func : binary_function<T, T, typename fun
    }
 };

+// Binary functor returning a * a + b * b for its two arguments.
+// The result type is functional_detail::FloatType<T>::type.
+template <typename T> struct magnitude_sqr_func : binary_function<T, T, typename functional_detail::FloatType<T>::type>
+{
+    __device__ __forceinline__ typename functional_detail::FloatType<T>::type operator ()(typename TypeTraits<T>::parameter_type a, typename TypeTraits<T>::parameter_type b) const
+    {
+        typedef typename functional_detail::FloatType<T>::type sum_type;
+
+        // The products are formed in the arguments' own arithmetic type;
+        // the conversion to the result type happens on this initialization.
+        const sum_type sumOfSquares = a * a + b * b;
+        return sumOfSquares;
+    }
+};
+
+// Binary functor computing the direction angle of the vector (x, y).
+// The angle comes from atan2_func<T>(y, x); when angleInDegrees is true it is
+// scaled by 180 / pi before being converted to T with saturate_cast.
+template <typename T, bool angleInDegrees> struct direction_func : binary_function<T, T, T>
+{
+    __device__ T operator ()(T x, T y) const
+    {
+        atan2_func<T> f;
+        typename atan2_func<T>::result_type angle = f(y, x);
+
+        // Branchless wrap: a negative angle is shifted up by 2 * pi, otherwise zero is added.
+        angle += (angle < 0) * (2.0f * CV_PI_F);
+
+        // angleInDegrees is a template parameter, so this condition is a compile-time constant.
+        if (angleInDegrees)
+            angle *= (180.0f / CV_PI_F);
+
+        return saturate_cast<T>(angle);
+    }
+};
+
 template <typename T> struct pow_func : binary_function<T, float, float>
 {
    __device__ __forceinline__ float operator ()(T val, float power) const
--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@@ -594,7 +594,7 @@ namespace integral_detail
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

-    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<uint> dst, int rows, int cols, cudaStream_t stream)
+    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 16 == 0)
@@ -614,7 +614,7 @@ namespace integral_detail
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

-    __host__ static void integral(const GlobPtr<uchar> src, GlobPtr<int> dst, int rows, int cols, cudaStream_t stream)
+    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);
--- a/modules/cudev/include/opencv2/cudev/grid/detail/minmaxloc.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/minmaxloc.hpp
@@ -0,0 +1,177 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma once
+
+#ifndef __OPENCV_CUDEV_GRID_MINMAXLOC_DETAIL_HPP__
+#define __OPENCV_CUDEV_GRID_MINMAXLOC_DETAIL_HPP__
+
+#include "../../common.hpp"
+#include "../../util/vec_traits.hpp"
+#include "../../util/type_traits.hpp"
+#include "../../util/limits.hpp"
+#include "../../block/reduce.hpp"
+
+namespace cv { namespace cudev {
+
+namespace grid_minmaxloc_detail
+{
+    // First pass of the two-pass min/max-with-location search.
+    //
+    // Each thread visits up to patch_y x patch_x elements, stepping by blockDim.y / blockDim.x,
+    // and keeps the smallest and largest unmasked value together with its flat index
+    // (y * cols + x). A thread that sees no unmasked element keeps location -1.
+    // blockReduceKeyVal then combines the per-thread candidates (less<> for the minimum,
+    // greater<> for the maximum) and thread 0 stores the block's result at index
+    // blockIdx.y * gridDim.x + blockIdx.x of each output array.
+    //
+    // BLOCK_SIZE sizes the shared staging arrays; the host wrapper in this file launches the
+    // kernel with BLOCK_SIZE equal to the number of threads per block.
+    template <int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
+    __global__ void minMaxLoc_pass_1(const SrcPtr src, ResType* minVal, ResType* maxVal, int* minLoc, int* maxLoc, const MaskPtr mask, const int rows, const int cols, const int patch_y, const int patch_x)
+    {
+        __shared__ ResType sMinVal[BLOCK_SIZE];
+        __shared__ ResType sMaxVal[BLOCK_SIZE];
+        // Locations are signed (-1 marks "nothing found"), so the staging arrays are int as well,
+        // matching minMaxLoc_pass_2.
+        __shared__ int sMinLoc[BLOCK_SIZE];
+        __shared__ int sMaxLoc[BLOCK_SIZE];
+
+        const int x0 = blockIdx.x * blockDim.x * patch_x + threadIdx.x;
+        const int y0 = blockIdx.y * blockDim.y * patch_y + threadIdx.y;
+
+        ResType myMin = numeric_limits<ResType>::max();
+        ResType myMax = -numeric_limits<ResType>::max();
+        int myMinLoc = -1;
+        int myMaxLoc = -1;
+
+        for (int i = 0, y = y0; i < patch_y && y < rows; ++i, y += blockDim.y)
+        {
+            for (int j = 0, x = x0; j < patch_x && x < cols; ++j, x += blockDim.x)
+            {
+                if (mask(y, x))
+                {
+                    const ResType srcVal = src(y, x);
+
+                    // Strict comparisons: on ties the element visited first by this thread is kept.
+                    if (srcVal < myMin)
+                    {
+                        myMin = srcVal;
+                        myMinLoc = y * cols + x;
+                    }
+
+                    if (srcVal > myMax)
+                    {
+                        myMax = srcVal;
+                        myMaxLoc = y * cols + x;
+                    }
+                }
+            }
+        }
+
+        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+
+        blockReduceKeyVal<BLOCK_SIZE>(smem_tuple(sMinVal, sMaxVal), tie(myMin, myMax),
+                                      smem_tuple(sMinLoc, sMaxLoc), tie(myMinLoc, myMaxLoc),
+                                      tid,
+                                      make_tuple(less<ResType>(), greater<ResType>()));
+
+        const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+
+        // One partial result per block; the second pass merges them.
+        if (tid == 0)
+        {
+            minVal[bid] = myMin;
+            maxVal[bid] = myMax;
+            minLoc[bid] = myMinLoc;
+            maxLoc[bid] = myMaxLoc;
+        }
+    }
+
+    // Second pass: merges the `count` per-block partial results produced by minMaxLoc_pass_1
+    // and leaves the final minimum/maximum and their locations in element 0 of each array.
+    // The reduction uses threadIdx.x only, and BLOCK_SIZE sizes the shared staging arrays.
+    template <int BLOCK_SIZE, typename T>
+    __global__ void minMaxLoc_pass_2(T* minVal, T* maxVal, int* minLoc, int* maxLoc, int count)
+    {
+        __shared__ T sMinVal[BLOCK_SIZE];
+        __shared__ T sMaxVal[BLOCK_SIZE];
+        __shared__ int sMinLoc[BLOCK_SIZE];
+        __shared__ int sMaxLoc[BLOCK_SIZE];
+
+        // Threads beyond the number of partial results re-read the last entry,
+        // so every thread feeds a valid candidate into the reduction.
+        const int srcIdx = ::min(threadIdx.x, count - 1);
+
+        T bestMin = minVal[srcIdx];
+        T bestMax = maxVal[srcIdx];
+        int bestMinLoc = minLoc[srcIdx];
+        int bestMaxLoc = maxLoc[srcIdx];
+
+        blockReduceKeyVal<BLOCK_SIZE>(smem_tuple(sMinVal, sMaxVal), tie(bestMin, bestMax),
+                                      smem_tuple(sMinLoc, sMaxLoc), tie(bestMinLoc, bestMaxLoc),
+                                      threadIdx.x,
+                                      make_tuple(less<T>(), greater<T>()));
+
+        // Only thread 0 publishes the merged result.
+        if (threadIdx.x != 0)
+            return;
+
+        minVal[0] = bestMin;
+        maxVal[0] = bestMax;
+        minLoc[0] = bestMinLoc;
+        maxLoc[0] = bestMaxLoc;
+    }
+
+    // Computes the launch configuration for minMaxLoc_pass_1.
+    //
+    // block comes from Policy::block_size_x / block_size_y. grid is sized so that each block
+    // covers block * Policy::patch_size elements per axis, then capped at block.x by block.y
+    // blocks, which keeps grid.x * grid.y no larger than the number of threads in one block.
+    template <class Policy>
+    void getLaunchCfg(int rows, int cols, dim3& block, dim3& grid)
+    {
+        block = dim3(Policy::block_size_x, Policy::block_size_y);
+        grid = dim3(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));
+
+        if (grid.x > block.x)
+            grid.x = block.x;
+
+        if (grid.y > block.y)
+            grid.y = block.y;
+    }
+
+    // Host entry point: launches both passes of the min/max-with-location search on `stream`.
+    //
+    // minVal / maxVal / minLoc / maxLoc are handed straight to the kernels. Pass 1 writes
+    // grid.x * grid.y partial results to each array and pass 2 leaves the final result in
+    // element 0. When stream == 0 the call also waits for the device to finish.
+    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
+    __host__ void minMaxLoc(const SrcPtr& src, ResType* minVal, ResType* maxVal, int* minLoc, int* maxLoc, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+    {
+        dim3 block, grid;
+        // getLaunchCfg takes (rows, cols). Passing them in that order keeps this grid identical
+        // to the one callers obtain from getLaunchCfg when sizing the result buffers.
+        getLaunchCfg<Policy>(rows, cols, block, grid);
+
+        // Per-thread patch sizes are derived from the (possibly capped) grid so that the
+        // launched blocks still cover the whole rows x cols area.
+        const int patch_x = divUp(divUp(cols, grid.x), block.x);
+        const int patch_y = divUp(divUp(rows, grid.y), block.y);
+
+        minMaxLoc_pass_1<Policy::block_size_x * Policy::block_size_y><<<grid, block, 0, stream>>>(src, minVal, maxVal, minLoc, maxLoc, mask, rows, cols, patch_y, patch_x);
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
+
+        // A single block merges the grid.x * grid.y partial results.
+        minMaxLoc_pass_2<Policy::block_size_x * Policy::block_size_y><<<1, Policy::block_size_x * Policy::block_size_y, 0, stream>>>(minVal, maxVal, minLoc, maxLoc, grid.x * grid.y);
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
+
+        if (stream == 0)
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
+    }
+}
+
+}}
+
+#endif
--- a/modules/cudev/include/opencv2/cudev/grid/detail/glob_reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/glob_reduce.hpp
@@ -43,8 +43,8 @@

 #pragma once

-#ifndef __OPENCV_CUDEV_GRID_GLOB_REDUCE_DETAIL_HPP__
-#define __OPENCV_CUDEV_GRID_GLOB_REDUCE_DETAIL_HPP__
+#ifndef __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__
+#define __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__

 #include "../../common.hpp"
 #include "../../util/tuple.hpp"
@@ -59,7 +59,7 @@

 namespace cv { namespace cudev {

-namespace grid_glob_reduce_detail
+namespace grid_reduce_detail
 {
    // Unroll

@@ -389,7 +389,7 @@ namespace grid_glob_reduce_detail
    // glob_reduce

    template <class Reductor, int BLOCK_SIZE, int PATCH_X, int PATCH_Y, class SrcPtr, typename ResType, class MaskPtr>
-    __global__ void glob_reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
+    __global__ void reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
    {
        const int x0 = blockIdx.x * blockDim.x * PATCH_X + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * PATCH_Y + threadIdx.y;
@@ -413,14 +413,12 @@ namespace grid_glob_reduce_detail
    }

    template <class Reductor, class Policy, class SrcPtr, typename ResType, class MaskPtr>
-    __host__ void glob_reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+    __host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

-        const int BLOCK_SIZE = Policy::block_size_x * Policy::block_size_y;
-
-        glob_reduce<Reductor, BLOCK_SIZE, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
+        reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
@@ -433,40 +431,33 @@ namespace grid_glob_reduce_detail
    __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;
+        typedef typename VecTraits<ResType>::elem_type res_elem_type;

-        glob_reduce<SumReductor<src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;

-        glob_reduce<MinMaxReductor<minop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        reduce<MinMaxReductor<minop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;

-        glob_reduce<MinMaxReductor<maxop<work_type>, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        reduce<MinMaxReductor<maxop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
-        const int cn = VecTraits<src_type>::cn;
-        typedef typename MakeVec<ResType, cn>::type work_type;

-        glob_reduce<MinMaxReductor<both, src_type, work_type>, Policy>(src, result, mask, rows, cols, stream);
+        reduce<MinMaxReductor<both, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }
 }

--- a/modules/cudev/include/opencv2/cudev/grid/detail/reduce_to_column.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/reduce_to_column.hpp
@@ -54,12 +54,52 @@ namespace cv { namespace cudev {

 namespace grid_reduce_to_vec_detail
 {
+    // Helper that dispatches the block reduction on the channel count `cn`.
+    // Each specialization takes one shared-memory row of BLOCK_SIZE elements per channel and
+    // reduces the matching components of myVal with blockReduce, indexed by threadIdx.x.
+    // The operator is Reductor rebound to the element type (Reductor::rebind<work_elem_type>::other).
+    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor, int cn> struct Reduce;
+
+    // cn == 1: myVal is passed to blockReduce as a whole together with smem[0].
+    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 1>
+    {
+        __device__ __forceinline__ static void call(work_elem_type smem[1][BLOCK_SIZE], work_type& myVal)
+        {
+            typename Reductor::template rebind<work_elem_type>::other op;
+            blockReduce<BLOCK_SIZE>(smem[0], myVal, threadIdx.x, op);
+        }
+    };
+
+    // cn == 2: reduces myVal.x and myVal.y through smem[0] and smem[1].
+    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 2>
+    {
+        __device__ __forceinline__ static void call(work_elem_type smem[2][BLOCK_SIZE], work_type& myVal)
+        {
+            typename Reductor::template rebind<work_elem_type>::other op;
+            blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1]), tie(myVal.x, myVal.y), threadIdx.x, make_tuple(op, op));
+        }
+    };
+
+    // cn == 3: reduces myVal.x, myVal.y and myVal.z through smem[0..2].
+    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 3>
+    {
+        __device__ __forceinline__ static void call(work_elem_type smem[3][BLOCK_SIZE], work_type& myVal)
+        {
+            typename Reductor::template rebind<work_elem_type>::other op;
+            blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2]), tie(myVal.x, myVal.y, myVal.z), threadIdx.x, make_tuple(op, op, op));
+        }
+    };
+
+    // cn == 4: reduces myVal.x, myVal.y, myVal.z and myVal.w through smem[0..3].
+    template <int BLOCK_SIZE, typename work_type, typename work_elem_type, class Reductor> struct Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, 4>
+    {
+        __device__ __forceinline__ static void call(work_elem_type smem[4][BLOCK_SIZE], work_type& myVal)
+        {
+            typename Reductor::template rebind<work_elem_type>::other op;
+            blockReduce<BLOCK_SIZE>(smem_tuple(smem[0], smem[1], smem[2], smem[3]), tie(myVal.x, myVal.y, myVal.z, myVal.w), threadIdx.x, make_tuple(op, op, op, op));
+        }
+    };
+
    template <class Reductor, int BLOCK_SIZE, class SrcPtr, typename ResType, class MaskPtr>
    __global__ void reduceToColumn(const SrcPtr src, ResType* dst, const MaskPtr mask, const int cols)
    {
        typedef typename Reductor::work_type work_type;
+        typedef typename VecTraits<work_type>::elem_type work_elem_type;
+        const int cn = VecTraits<work_type>::cn;

-        __shared__ work_type smem[BLOCK_SIZE];
+        __shared__ work_elem_type smem[cn][BLOCK_SIZE];

        const int y = blockIdx.x;

@@ -75,7 +115,7 @@ namespace grid_reduce_to_vec_detail
            }
        }

-        blockReduce<BLOCK_SIZE>(smem, myVal, threadIdx.x, op);
+        Reduce<BLOCK_SIZE, work_type, work_elem_type, Reductor, cn>::call(smem, myVal);

        if (threadIdx.x == 0)
            dst[y] = saturate_cast<ResType>(Reductor::result(myVal, cols));
--- a/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/transform.hpp
@@ -217,7 +217,7 @@ namespace grid_transform_detail
    }

    template <int SHIFT, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
-    __global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, PtrStep<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
+    __global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, GlobPtr<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
    {
        typedef typename MakeVec<SrcType1, SHIFT>::type read_type1;
        typedef typename MakeVec<SrcType2, SHIFT>::type read_type2;
@@ -345,25 +345,25 @@ namespace grid_transform_detail
    };

    template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
-    __host__ void transform(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+    __host__ void transform_unary(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
-    __host__ void transform(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+    __host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<false, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
    }

    template <class Policy, typename SrcType, typename DstType, class UnOp, class MaskPtr>
-    __host__ void transform(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+    __host__ void transform_unary(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<VecTraits<SrcType>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src, dst, op, mask, rows, cols, stream);
    }

    template <class Policy, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
-    __host__ void transform(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
+    __host__ void transform_binary(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<VecTraits<SrcType1>::cn == 1 && VecTraits<SrcType2>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
    }
--- a/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/transpose.hpp
@@ -55,15 +55,12 @@ namespace cv { namespace cudev {

 namespace transpose_detail
 {
-    const int TRANSPOSE_TILE_DIM   = 16;
-    const int TRANSPOSE_BLOCK_ROWS = 16;
-
-    template <class SrcPtr, typename DstType>
+    template <int TILE_DIM, int BLOCK_DIM_Y, class SrcPtr, typename DstType>
    __global__ void transpose(const SrcPtr src, GlobPtr<DstType> dst, const int rows, const int cols)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

-        __shared__ src_type tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
+        __shared__ src_type tile[TILE_DIM][TILE_DIM + 1];

        int blockIdx_x, blockIdx_y;

@@ -80,12 +77,12 @@ namespace transpose_detail
            blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
        }

-        int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
-        int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
+        int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
+        int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;

        if (xIndex < cols)
        {
-            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
+            for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
            {
                if (yIndex + i < rows)
                {
@@ -96,12 +93,12 @@ namespace transpose_detail

        __syncthreads();

-        xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
-        yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
+        xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
+        yIndex = blockIdx_x * TILE_DIM + threadIdx.y;

        if (xIndex < rows)
        {
-            for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
+            for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
            {
                if (yIndex + i < cols)
                {
@@ -111,13 +108,13 @@ namespace transpose_detail
        }
    }

-    template <class SrcPtr, typename DstType>
+    template <class Policy, class SrcPtr, typename DstType>
    __host__ void transpose(const SrcPtr& src, const GlobPtr<DstType>& dst, int rows, int cols, cudaStream_t stream)
    {
-        const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
+        const dim3 block(Policy::tile_dim, Policy::block_dim_y);
        const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

-        transpose<<<grid, block, 0, stream>>>(src, dst, rows, cols);
+        transpose<Policy::tile_dim, Policy::block_dim_y><<<grid, block, 0, stream>>>(src, dst, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
--- a/modules/cudev/include/opencv2/cudev/grid/glob_reduce.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/glob_reduce.hpp
@@ -43,8 +43,8 @@

 #pragma once

-#ifndef __OPENCV_CUDEV_GRID_GLOB_REDUCE_HPP__
-#define __OPENCV_CUDEV_GRID_GLOB_REDUCE_HPP__
+#ifndef __OPENCV_CUDEV_GRID_REDUCE_HPP__
+#define __OPENCV_CUDEV_GRID_REDUCE_HPP__

 #include <limits>
 #include "../common.hpp"
@@ -52,13 +52,18 @@
 #include "../ptr2d/gpumat.hpp"
 #include "../ptr2d/mask.hpp"
 #include "../ptr2d/transform.hpp"
-#include "detail/glob_reduce.hpp"
+#include "detail/reduce.hpp"
+#include "detail/minmaxloc.hpp"

 namespace cv { namespace cudev {

 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( unsigned(VecTraits<src_type>::cn) == unsigned(VecTraits<ResType>::cn), "" );
+
    dst.create(1, 1);
    dst.setTo(0, stream);

@@ -67,27 +72,31 @@ __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskP

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

-    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(src),
-                                         dst[0],
-                                         shrinkPtr(mask),
-                                         rows, cols,
-                                         StreamAccessor::getStream(stream));
+    grid_reduce_detail::sum<Policy>(shrinkPtr(src),
+                                    dst[0],
+                                    shrinkPtr(mask),
+                                    rows, cols,
+                                    StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType>
 __host__ void gridCalcSum_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream& stream = Stream::Null())
 {
+    typedef typename PtrTraits<SrcPtr>::value_type src_type;
+
+    CV_StaticAssert( unsigned(VecTraits<src_type>::cn) == unsigned(VecTraits<ResType>::cn), "" );
+
    dst.create(1, 1);
    dst.setTo(0, stream);

    const int rows = getRows(src);
    const int cols = getCols(src);

-    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(src),
-                                         dst[0],
-                                         WithOutMask(),
-                                         rows, cols,
-                                         StreamAccessor::getStream(stream));
+    grid_reduce_detail::sum<Policy>(shrinkPtr(src),
+                                    dst[0],
+                                    WithOutMask(),
+                                    rows, cols,
+                                    StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@@ -101,11 +110,11 @@ __host__ void gridFindMinVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const Ma

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

-    grid_glob_reduce_detail::minVal<Policy>(shrinkPtr(src),
-                                            dst[0],
-                                            shrinkPtr(mask),
-                                            rows, cols,
-                                            StreamAccessor::getStream(stream));
+    grid_reduce_detail::minVal<Policy>(shrinkPtr(src),
+                                       dst[0],
+                                       shrinkPtr(mask),
+                                       rows, cols,
+                                       StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType>
@@ -117,11 +126,11 @@ __host__ void gridFindMinVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
    const int rows = getRows(src);
    const int cols = getCols(src);

-    grid_glob_reduce_detail::minVal<Policy>(shrinkPtr(src),
-                                            dst[0],
-                                            WithOutMask(),
-                                            rows, cols,
-                                            StreamAccessor::getStream(stream));
+    grid_reduce_detail::minVal<Policy>(shrinkPtr(src),
+                                       dst[0],
+                                       WithOutMask(),
+                                       rows, cols,
+                                       StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@@ -135,11 +144,11 @@ __host__ void gridFindMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const Ma

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

-    grid_glob_reduce_detail::maxVal<Policy>(shrinkPtr(src),
-                                            dst[0],
-                                            shrinkPtr(mask),
-                                            rows, cols,
-                                            StreamAccessor::getStream(stream));
+    grid_reduce_detail::maxVal<Policy>(shrinkPtr(src),
+                                       dst[0],
+                                       shrinkPtr(mask),
+                                       rows, cols,
+                                       StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType>
@@ -151,11 +160,11 @@ __host__ void gridFindMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream&
    const int rows = getRows(src);
    const int cols = getCols(src);

-    grid_glob_reduce_detail::maxVal<Policy>(shrinkPtr(src),
-                                            dst[0],
-                                            WithOutMask(),
-                                            rows, cols,
-                                            StreamAccessor::getStream(stream));
+    grid_reduce_detail::maxVal<Policy>(shrinkPtr(src),
+                                       dst[0],
+                                       WithOutMask(),
+                                       rows, cols,
+                                       StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@@ -170,11 +179,11 @@ __host__ void gridFindMinMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, const

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

-    grid_glob_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
-                                               dst[0],
-                                               shrinkPtr(mask),
-                                               rows, cols,
-                                               StreamAccessor::getStream(stream));
+    grid_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
+                                          dst[0],
+                                          shrinkPtr(mask),
+                                          rows, cols,
+                                          StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType>
@@ -187,11 +196,51 @@ __host__ void gridFindMinMaxVal_(const SrcPtr& src, GpuMat_<ResType>& dst, Strea
    const int rows = getRows(src);
    const int cols = getCols(src);

-    grid_glob_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
-                                               dst[0],
-                                               WithOutMask(),
-                                               rows, cols,
-                                               StreamAccessor::getStream(stream));
+    grid_reduce_detail::minMaxVal<Policy>(shrinkPtr(src),
+                                          dst[0],
+                                          WithOutMask(),
+                                          rows, cols,
+                                          StreamAccessor::getStream(stream));
+}
+
+template <class Policy, class SrcPtr, typename ResType, class MaskPtr> // masked overload: sizes valBuf/locBuf and calls grid_minmaxloc_detail::minMaxLoc
+__host__ void gridMinMaxLoc_(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); // mask must have exactly the size of src
+
+    dim3 grid, block;
+    grid_minmaxloc_detail::getLaunchCfg<Policy>(rows, cols, block, grid); // presumably fills block/grid for this Policy and image size - confirm in grid_minmaxloc_detail
+
+    valBuf.create(2, grid.x * grid.y); // 2 rows by grid.x * grid.y columns (presumably one column per block of the launch grid)
+    locBuf.create(2, grid.x * grid.y); // same shape as valBuf, int elements
+
+    grid_minmaxloc_detail::minMaxLoc<Policy>(shrinkPtr(src),
+                                             valBuf[0], valBuf[1], locBuf[0], locBuf[1], // row 0 and row 1 of each buffer; presumably the min / max halves - TODO confirm
+                                             shrinkPtr(mask),
+                                             rows, cols,
+                                             StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
+}
+
+template <class Policy, class SrcPtr, typename ResType> // unmasked overload: same steps as the masked one, with WithOutMask() in the mask slot
+__host__ void gridMinMaxLoc_(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, Stream& stream = Stream::Null())
+{
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    dim3 grid, block;
+    grid_minmaxloc_detail::getLaunchCfg<Policy>(rows, cols, block, grid); // presumably fills block/grid for this Policy and image size - confirm in grid_minmaxloc_detail
+
+    valBuf.create(2, grid.x * grid.y); // 2 rows by grid.x * grid.y columns
+    locBuf.create(2, grid.x * grid.y); // same shape as valBuf, int elements
+
+    grid_minmaxloc_detail::minMaxLoc<Policy>(shrinkPtr(src),
+                                             valBuf[0], valBuf[1], locBuf[0], locBuf[1], // row 0 and row 1 of each buffer; presumably the min / max halves - TODO confirm
+                                             WithOutMask(), // no user mask supplied
+                                             rows, cols,
+                                             StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
 }

 template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
@@ -209,11 +258,11 @@ __host__ void gridCountNonZero_(const SrcPtr& src, GpuMat_<ResType>& dst, const
    not_equal_to<src_type> ne_op;
    const src_type zero = VecTraits<src_type>::all(0);

-    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
-                                         dst[0],
-                                         shrinkPtr(mask),
-                                         rows, cols,
-                                         StreamAccessor::getStream(stream));
+    grid_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
+                                    dst[0],
+                                    shrinkPtr(mask),
+                                    rows, cols,
+                                    StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename ResType>
@@ -229,11 +278,11 @@ __host__ void gridCountNonZero_(const SrcPtr& src, GpuMat_<ResType>& dst, Stream
    not_equal_to<src_type> ne_op;
    const src_type zero = VecTraits<src_type>::all(0);

-    grid_glob_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
-                                         dst[0],
-                                         WithOutMask(),
-                                         rows, cols,
-                                         StreamAccessor::getStream(stream));
+    grid_reduce_detail::sum<Policy>(shrinkPtr(transformPtr(src, bind2nd(ne_op, zero))),
+                                    dst[0],
+                                    WithOutMask(),
+                                    rows, cols,
+                                    StreamAccessor::getStream(stream));
 }

 // default policy
@@ -297,6 +346,18 @@ __host__ void gridFindMinMaxVal(const SrcPtr& src, GpuMat_<ResType>& dst, Stream
    gridFindMinMaxVal_<DefaultGlobReducePolicy>(src, dst, stream);
 }

+template <class SrcPtr, typename ResType, class MaskPtr> // default-policy entry point, masked overload
+__host__ void gridMinMaxLoc(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridMinMaxLoc_<DefaultGlobReducePolicy>(src, valBuf, locBuf, mask, stream); // forwards all arguments unchanged with DefaultGlobReducePolicy
+}
+
+template <class SrcPtr, typename ResType> // default-policy entry point, unmasked overload
+__host__ void gridMinMaxLoc(const SrcPtr& src, GpuMat_<ResType>& valBuf, GpuMat_<int>& locBuf, Stream& stream = Stream::Null())
+{
+    gridMinMaxLoc_<DefaultGlobReducePolicy>(src, valBuf, locBuf, stream); // forwards all arguments unchanged with DefaultGlobReducePolicy
+}
+
 template <class SrcPtr, typename ResType, class MaskPtr>
 __host__ void gridCountNonZero(const SrcPtr& src, GpuMat_<ResType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
--- a/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/reduce_to_vec.hpp
@@ -49,6 +49,7 @@
 #include "../common.hpp"
 #include "../util/vec_traits.hpp"
 #include "../util/limits.hpp"
+#include "../util/saturate_cast.hpp"
 #include "../ptr2d/traits.hpp"
 #include "../ptr2d/gpumat.hpp"
 #include "../ptr2d/mask.hpp"
@@ -62,6 +63,11 @@ template <typename T> struct Sum : plus<T>
 {
    typedef T work_type;

+    template <typename U> struct rebind
+    {
+        typedef Sum<U> other;
+    };
+
    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(0);
@@ -77,14 +83,19 @@ template <typename T> struct Avg : plus<T>
 {
    typedef T work_type;

+    template <typename U> struct rebind
+    {
+        typedef Avg<U> other;
+    };
+
    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(0);
    }

-    __device__ __forceinline__ static T result(T r, int sz)
+    __device__ __forceinline__ static T result(T r, float sz)
    {
-        return r / sz;
+        return saturate_cast<T>(r / sz);
    }
 };

@@ -92,6 +103,11 @@ template <typename T> struct Min : minimum<T>
 {
    typedef T work_type;

+    template <typename U> struct rebind
+    {
+        typedef Min<U> other;
+    };
+
    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
@@ -107,6 +123,11 @@ template <typename T> struct Max : maximum<T>
 {
    typedef T work_type;

+    template <typename U> struct rebind
+    {
+        typedef Max<U> other;
+    };
+
    __device__ __forceinline__ static T initialValue()
    {
        return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
@@ -158,7 +179,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, cons

    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

-    createContinuous(rows, 1, DataType<ResType>::type, dst);
+    dst.create(1, rows);

    grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
                                                                dst[0],
@@ -173,7 +194,7 @@ __host__ void gridReduceToColumn_(const SrcPtr& src, GpuMat_<ResType>& dst, Stre
    const int rows = getRows(src);
    const int cols = getCols(src);

-    createContinuous(rows, 1, DataType<ResType>::type, dst);
+    dst.create(1, rows);

    grid_reduce_to_vec_detail::reduceToColumn<Reductor, Policy>(shrinkPtr(src),
                                                                dst[0],
--- a/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/split_merge.hpp
@@ -51,6 +51,7 @@
 #include "../util/vec_traits.hpp"
 #include "../ptr2d/traits.hpp"
 #include "../ptr2d/gpumat.hpp"
+#include "../ptr2d/glob.hpp"
 #include "../ptr2d/mask.hpp"
 #include "detail/split_merge.hpp"

@@ -75,6 +76,24 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const Ma
                                                                              StreamAccessor::getStream(stream));
 }

+template <class Policy, class SrcPtrTuple, typename DstType, class MaskPtr> // masked merge into a GlobPtrSz destination
+__host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<DstType>::cn == tuple_size<SrcPtrTuple>::value, "" ); // DstType's cn must equal the number of entries in the source tuple
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); // dst is not allocated here, so it must already match src in size
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); // mask must match src in size as well
+
+    grid_split_merge_detail::MergeImpl<VecTraits<DstType>::cn, Policy>::merge(shrinkPtr(src),
+                                                                              shrinkPtr(dst),
+                                                                              shrinkPtr(mask),
+                                                                              rows, cols,
+                                                                              StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
+}
+
 template <class Policy, class SrcPtrTuple, typename DstType>
 __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
 {
@@ -92,6 +111,23 @@ __host__ void gridMerge_(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream&
                                                                              StreamAccessor::getStream(stream));
 }

+template <class Policy, class SrcPtrTuple, typename DstType> // unmasked merge into a GlobPtrSz destination
+__host__ void gridMerge_(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<DstType>::cn == tuple_size<SrcPtrTuple>::value, "" ); // DstType's cn must equal the number of entries in the source tuple
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); // dst is not allocated here, so it must already match src in size
+
+    grid_split_merge_detail::MergeImpl<VecTraits<DstType>::cn, Policy>::merge(shrinkPtr(src),
+                                                                              shrinkPtr(dst),
+                                                                              WithOutMask(), // no user mask supplied
+                                                                              rows, cols,
+                                                                              StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
+}
+
 template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
 __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -132,6 +168,25 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[2], const Ma
                                           StreamAccessor::getStream(stream));
 }

+template <class Policy, class SrcPtr, typename DstType, class MaskPtr> // masked split of a cn == 2 source into two GlobPtrSz destinations
+__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[2], const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 2, "" ); // source value type must have cn == 2 to match dst[2]
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols ); // destinations are not allocated here; each must already match src in size
+    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); // mask must match src in size as well
+
+    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), // one destination per array element, in index order
+                                           shrinkPtr(mask),
+                                           rows, cols,
+                                           StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
+}
+
 template <class Policy, class SrcPtr, typename DstType>
 __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, Stream& stream = Stream::Null())
 {
@@ -168,6 +223,24 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[2], Stream&
                                           StreamAccessor::getStream(stream));
 }

+template <class Policy, class SrcPtr, typename DstType> // unmasked split of a cn == 2 source into two GlobPtrSz destinations
+__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[2], Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 2, "" ); // source value type must have cn == 2 to match dst[2]
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols ); // destinations are not allocated here; each must already match src in size
+    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
+
+    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), // one destination per array element, in index order
+                                           WithOutMask(), // no user mask supplied
+                                           rows, cols,
+                                           StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
+}
+
 template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
 __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -210,6 +283,26 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[3], const Ma
                                           StreamAccessor::getStream(stream));
 }

+template <class Policy, class SrcPtr, typename DstType, class MaskPtr> // masked split of a cn == 3 source into three GlobPtrSz destinations
+__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[3], const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 3, "" ); // source value type must have cn == 3 to match dst[3]
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols ); // destinations are not allocated here; each must already match src in size
+    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
+    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); // mask must match src in size as well
+
+    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), // one destination per array element, in index order
+                                           shrinkPtr(mask),
+                                           rows, cols,
+                                           StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
+}
+
 template <class Policy, class SrcPtr, typename DstType>
 __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, Stream& stream = Stream::Null())
 {
@@ -248,6 +341,25 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[3], Stream&
                                           StreamAccessor::getStream(stream));
 }

+template <class Policy, class SrcPtr, typename DstType> // unmasked split of a cn == 3 source into three GlobPtrSz destinations
+__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[3], Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 3, "" ); // source value type must have cn == 3 to match dst[3]
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols ); // destinations are not allocated here; each must already match src in size
+    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
+    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
+
+    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), // one destination per array element, in index order
+                                           WithOutMask(), // no user mask supplied
+                                           rows, cols,
+                                           StreamAccessor::getStream(stream)); // raw stream handle taken from the caller's Stream
+}
+
 template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
 __host__ void gridSplit_(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -283,10 +395,31 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[4], const Ma
    dst[0].create(rows, cols);
    dst[1].create(rows, cols);
    dst[2].create(rows, cols);
-    dst[4].create(rows, cols);
+    dst[3].create(rows, cols);

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
-                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[4]),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
+                                           shrinkPtr(mask),
+                                           rows, cols,
+                                           StreamAccessor::getStream(stream));
+}
+
+template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
+__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[4], const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 4, "" );
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
+    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
+    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
+    CV_Assert( getRows(dst[3]) == rows && getCols(dst[3]) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
+
+    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
                                           shrinkPtr(mask),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
@@ -323,10 +456,30 @@ __host__ void gridSplit_(const SrcPtr& src, GpuMat_<DstType> (&dst)[4], Stream&
    dst[0].create(rows, cols);
    dst[1].create(rows, cols);
    dst[2].create(rows, cols);
-    dst[4].create(rows, cols);
+    dst[3].create(rows, cols);

    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
-                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[4]),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
+                                           WithOutMask(),
+                                           rows, cols,
+                                           StreamAccessor::getStream(stream));
+}
+
+template <class Policy, class SrcPtr, typename DstType>
+__host__ void gridSplit_(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[4], Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( VecTraits<typename PtrTraits<SrcPtr>::value_type>::cn == 4, "" );
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst[0]) == rows && getCols(dst[0]) == cols );
+    CV_Assert( getRows(dst[1]) == rows && getCols(dst[1]) == cols );
+    CV_Assert( getRows(dst[2]) == rows && getCols(dst[2]) == cols );
+    CV_Assert( getRows(dst[3]) == rows && getCols(dst[3]) == cols );
+
+    grid_split_merge_detail::split<Policy>(shrinkPtr(src),
+                                           shrinkPtr(dst[0]), shrinkPtr(dst[1]), shrinkPtr(dst[2]), shrinkPtr(dst[3]),
                                           WithOutMask(),
                                           rows, cols,
                                           StreamAccessor::getStream(stream));
@@ -348,12 +501,24 @@ __host__ void gridMerge(const SrcPtrTuple& src, GpuMat_<DstType>& dst, const Mas
    gridMerge_<DefaultSplitMergePolicy>(src, dst, mask, stream);
 }

+template <class SrcPtrTuple, typename DstType, class MaskPtr> // default-policy entry point: masked merge into a GlobPtrSz destination
+__host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridMerge_<DefaultSplitMergePolicy>(src, dst, mask, stream); // forwards all arguments unchanged with DefaultSplitMergePolicy
+}
+
 template <class SrcPtrTuple, typename DstType>
 __host__ void gridMerge(const SrcPtrTuple& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
 {
    gridMerge_<DefaultSplitMergePolicy>(src, dst, stream);
 }

+template <class SrcPtrTuple, typename DstType> // default-policy entry point: unmasked merge into a GlobPtrSz destination
+__host__ void gridMerge(const SrcPtrTuple& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{
+    gridMerge_<DefaultSplitMergePolicy>(src, dst, stream); // forwards all arguments unchanged with DefaultSplitMergePolicy
+}
+
 template <class SrcPtr, typename DstType, class MaskPtr>
 __host__ void gridSplit(const SrcPtr& src, const tuple< GpuMat_<DstType>&, GpuMat_<DstType>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -396,12 +561,24 @@ __host__ void gridSplit(const SrcPtr& src, GpuMat_<DstType> (&dst)[COUNT], const
    gridSplit_<DefaultSplitMergePolicy>(src, dst, mask, stream);
 }

+template <class SrcPtr, typename DstType, int COUNT, class MaskPtr> // default-policy entry point: masked split into an array of COUNT GlobPtrSz destinations
+__host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridSplit_<DefaultSplitMergePolicy>(src, dst, mask, stream); // overload resolution on the array extent picks the matching gridSplit_
+}
+
 template <class SrcPtr, typename DstType, int COUNT>
 __host__ void gridSplit(const SrcPtr& src, GpuMat_<DstType> (&dst)[COUNT], Stream& stream = Stream::Null())
 {
    gridSplit_<DefaultSplitMergePolicy>(src, dst, stream);
 }

+template <class SrcPtr, typename DstType, int COUNT> // default-policy entry point: unmasked split into an array of COUNT GlobPtrSz destinations
+__host__ void gridSplit(const SrcPtr& src, GlobPtrSz<DstType> (&dst)[COUNT], Stream& stream = Stream::Null())
+{
+    gridSplit_<DefaultSplitMergePolicy>(src, dst, stream); // overload resolution on the array extent picks the matching gridSplit_
+}
+
 }}

 #endif
--- a/modules/cudev/include/opencv2/cudev/grid/transform.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/transform.hpp
@@ -58,7 +58,7 @@
 namespace cv { namespace cudev {

 template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    const int rows = getRows(src);
    const int cols = getCols(src);
@@ -67,11 +67,11 @@ __host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnO

    dst.create(rows, cols);

-    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    const int rows = getRows(src);
    const int cols = getCols(src);
@@ -79,33 +79,33 @@ __host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, c
    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

-    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename DstType, class UnOp>
-__host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null()) // renamed from gridTransform_; unmasked overload that allocates the GpuMat_ destination
 {
    const int rows = getRows(src);
    const int cols = getCols(src);

    dst.create(rows, cols);

-    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); // detail entry point renamed from transform to transform_unary; arguments unchanged
 }

 template <class Policy, class SrcPtr, typename DstType, class UnOp>
-__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null()) // renamed from gridTransform_; unmasked overload writing into a pre-sized GlobPtrSz destination
 {
    const int rows = getRows(src);
    const int cols = getCols(src);

    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );

-    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_unary<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); // detail entry point renamed from transform to transform_unary; arguments unchanged
 }

 template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@@ -115,11 +115,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D

    dst.create(rows, cols);

-    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@@ -128,11 +128,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const Glo
    CV_Assert( getRows(src2) == rows && getCols(src2) == cols );
    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );

-    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp>
-__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
 {
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@@ -141,11 +141,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D

    dst.create(rows, cols);

-    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp>
-__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GlobPtrSz<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
 {
    const int rows = getRows(src1);
    const int cols = getCols(src1);
@@ -153,11 +153,11 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GlobPtrSz
    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
    CV_Assert( getRows(src2) == rows && getCols(src2) == cols );

-    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
+    grid_transform_detail::transform_binary<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@@ -178,7 +178,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@@ -198,7 +198,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@@ -217,7 +217,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" );

@@ -236,7 +236,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@@ -258,7 +258,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@@ -279,7 +279,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@@ -299,7 +299,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" );

@@ -319,7 +319,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@@ -342,7 +342,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@@ -364,7 +364,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, Glob
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@@ -385,7 +385,7 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
 }

 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
-__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" );

@@ -417,123 +417,123 @@ struct DefaultTransformPolicy
 };

 template <class SrcPtr, typename DstType, class Op, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: GpuMat_ destination, with mask
 }

 template <class SrcPtr, typename DstType, class Op, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: GlobPtrSz destination, with mask
 }

 template <class SrcPtr, typename DstType, class Op>
-__host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: GpuMat_ destination, no mask argument
 }

 template <class SrcPtr, typename DstType, class Op>
-__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
+__host__ void gridTransformUnary(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformUnary_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: GlobPtrSz destination, no mask argument
 }

 template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr>
-__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null()) // src2 now uses SrcPtr2; the old signature typed both sources as SrcPtr1
 {
-    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
+    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream); // default-policy overload: GpuMat_ destination, with mask
 }

 template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr>
-__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null()) // src2 now uses SrcPtr2; the old signature typed both sources as SrcPtr1
 {
-    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
+    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream); // default-policy overload: GlobPtrSz destination, with mask
 }

 template <class SrcPtr1, class SrcPtr2, typename DstType, class Op>
-__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null()) // src2 now uses SrcPtr2; the old signature typed both sources as SrcPtr1
 {
-    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
+    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, stream); // default-policy overload: GpuMat_ destination, no mask argument
 }

 template <class SrcPtr1, class SrcPtr2, typename DstType, class Op>
-__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
+__host__ void gridTransformBinary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null()) // src2 now uses SrcPtr2; the old signature typed both sources as SrcPtr1
 {
-    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
+    gridTransformBinary_<DefaultTransformPolicy>(src1, src2, dst, op, stream); // default-policy overload: GlobPtrSz destination, no mask argument
 }

 template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: two GpuMat_ destinations, with mask
 }

 template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: two GlobPtrSz destinations, with mask
 }

 template <class SrcPtr, typename D0, typename D1, class OpTuple>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: two GpuMat_ destinations, no mask argument
 }

 template <class SrcPtr, typename D0, typename D1, class OpTuple>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: two GlobPtrSz destinations, no mask argument
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: three GpuMat_ destinations, with mask
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: three GlobPtrSz destinations, with mask
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: three GpuMat_ destinations, no mask argument
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: three GlobPtrSz destinations, no mask argument
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: four GpuMat_ destinations, with mask
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, mask, stream); // default-policy overload: four GlobPtrSz destinations, with mask
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: four GpuMat_ destinations, no mask argument
 }

 template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
-__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+__host__ void gridTransformTuple(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
-    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
+    gridTransformTuple_<DefaultTransformPolicy>(src, dst, op, stream); // default-policy overload: four GlobPtrSz destinations, no mask argument
 }

 }}
--- a/modules/cudev/include/opencv2/cudev/grid/transpose.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/transpose.hpp
@@ -49,19 +49,53 @@
 #include "../common.hpp"
 #include "../ptr2d/traits.hpp"
 #include "../ptr2d/gpumat.hpp"
+#include "../ptr2d/glob.hpp"
 #include "detail/transpose.hpp"

 namespace cv { namespace cudev {

-template <class SrcPtr, typename DstType>
-__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
+template <class Policy, class SrcPtr, typename DstType> // Policy is passed on as the template argument of transpose_detail::transpose
+__host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null()) // policy-parameterised variant; GpuMat_ destination is (re)created inside
 {
    const int rows = getRows(src);
    const int cols = getCols(src);

    dst.create(cols, rows);

-    transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
+    transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream)); // rows/cols are the source dimensions; dst was created above as cols x rows
+}
+
+template <class Policy, class SrcPtr, typename DstType>
+__host__ void gridTranspose_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null()) // overload for a caller-provided GlobPtrSz destination; dst is not allocated here
+{
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == cols && getCols(dst) == rows ); // dst must already have the transposed size (cols x rows)
+
+    transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream)); // rows/cols are the source dimensions; the stream handle comes from StreamAccessor::getStream
+}
+
+// Default Policy
+
+struct DefaultTransposePolicy // policy type the gridTranspose() overloads pass to gridTranspose_
+{
+    enum {
+        tile_dim    = 16, // NOTE(review): presumably the tile edge used by transpose_detail::transpose - confirm in detail/transpose.hpp
+        block_dim_y = 16 // NOTE(review): presumably the launch block height - confirm in detail/transpose.hpp
+    };
+};
+
+template <class SrcPtr, typename DstType>
+__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null()) // same name and signature as the function removed earlier in this hunk
+{
+    gridTranspose_<DefaultTransposePolicy>(src, dst, stream); // default-policy overload: GpuMat_ destination
+}
+
+template <class SrcPtr, typename DstType>
+__host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null()) // new overload for a GlobPtrSz destination
+{
+    gridTranspose_<DefaultTransposePolicy>(src, dst, stream); // default-policy overload: GlobPtrSz destination, size is asserted by gridTranspose_
 }

 }}
--- a/modules/cudev/include/opencv2/cudev/ptr2d/lut.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/lut.hpp
@@ -47,6 +47,7 @@
 #define __OPENCV_CUDEV_PTR2D_LUT_HPP__

 #include "../common.hpp"
+#include "../util/vec_traits.hpp"
 #include "../grid/copy.hpp"
 #include "traits.hpp"
 #include "gpumat.hpp"
@@ -63,7 +64,8 @@ template <class SrcPtr, class TablePtr> struct LutPtr

    __device__ __forceinline__ typename PtrTraits<TablePtr>::value_type operator ()(typename PtrTraits<SrcPtr>::index_type y, typename PtrTraits<SrcPtr>::index_type x) const
    {
-        return tbl(0, src(y, x));
+        typedef typename PtrTraits<TablePtr>::index_type tbl_index_type; // index type declared by the table's PtrTraits, not by SrcPtr
+        return tbl(VecTraits<tbl_index_type>::all(0), src(y, x)); // first table index is 0 built in tbl_index_type; second is the source value at (y, x)
    }
 };

@@ -81,8 +83,6 @@ template <class SrcPtr, class TablePtr> struct LutPtrSz : LutPtr<SrcPtr, TablePt
 template <class SrcPtr, class TablePtr>
 __host__ LutPtrSz<typename PtrTraits<SrcPtr>::ptr_type, typename PtrTraits<TablePtr>::ptr_type> lutPtr(const SrcPtr& src, const TablePtr& tbl)
 {
-    CV_Assert( getRows(tbl) == 1 );
-
    LutPtrSz<typename PtrTraits<SrcPtr>::ptr_type, typename PtrTraits<TablePtr>::ptr_type> ptr;
    ptr.src = shrinkPtr(src);
    ptr.tbl = shrinkPtr(tbl);
--- a/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/mask.hpp
@@ -62,6 +62,42 @@ struct WithOutMask
    }
 };

+template <class MaskPtr> struct SingleMaskChannels // mask accessor that divides the x index by `channels` before reading the wrapped mask
+{
+    typedef typename PtrTraits<MaskPtr>::value_type value_type;
+    typedef typename PtrTraits<MaskPtr>::index_type index_type;
+
+    MaskPtr mask; // wrapped mask accessor
+    int channels; // divisor applied to x in operator()
+
+    __device__ __forceinline__ value_type operator()(index_type y, index_type x) const
+    {
+        return mask(y, x / channels); // with an integer index_type, `channels` consecutive x values read the same mask element
+    }
+
+};
+
+template <class MaskPtr> struct SingleMaskChannelsSz : SingleMaskChannels<MaskPtr> // SingleMaskChannels plus explicit dimensions
+{
+    int rows, cols; // singleMaskChannels() sets rows = getRows(mask) and cols = getCols(mask) * channels
+};
+
+template <class MaskPtr>
+__host__ SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type>
+singleMaskChannels(const MaskPtr& mask, int channels) // host-side factory for the sized wrapper; channels is stored without validation
+{
+    SingleMaskChannelsSz<typename PtrTraits<MaskPtr>::ptr_type> ptr;
+    ptr.mask = shrinkPtr(mask);
+    ptr.channels = channels; // NOTE(review): operator() divides by this value, so callers presumably must pass channels >= 1 - confirm
+    ptr.rows = getRows(mask);
+    ptr.cols = getCols(mask) * channels; // reported width is the mask width multiplied by channels
+    return ptr;
+}
+
+template <class MaskPtr> struct PtrTraits< SingleMaskChannelsSz<MaskPtr> > : PtrTraitsBase<SingleMaskChannelsSz<MaskPtr>, SingleMaskChannels<MaskPtr> > // NOTE(review): presumably makes SingleMaskChannels<MaskPtr> the shrunk pointer type - confirm against PtrTraitsBase
+{
+};
+
 }}

 #endif
--- a/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/vec_math.hpp
@@ -194,10 +194,23 @@ CV_CUDEV_IMPLEMENT_VEC_UNARY_OP(~, uint, uint)
        return VecTraits<output_type ## 4>::make(func (a.x), func (a.y), func (a.z), func (a.w)); \
    }

+namespace vec_math_detail // abs_ helpers passed to the char and short abs instantiations that follow
+{
+    __device__ __forceinline__ schar abs_(schar val)
+    {
+        return (schar) ::abs((int) val); // explicit int cast before ::abs, then narrowed back; NOTE(review): for -128 the int result 128 does not fit in schar - confirm the narrowing is intended
+    }
+
+    __device__ __forceinline__ short abs_(short val)
+    {
+        return (short) ::abs((int) val); // same pattern as the schar overload; NOTE(review): the minimum short value has the same narrowing caveat - confirm
+    }
+}
+
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uchar, uchar)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, char, char)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, char, char)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, ushort, ushort)
-CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, short, short)
+CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, vec_math_detail::abs_, short, short)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::abs, int, int)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, /*::abs*/, uint, uint)
 CV_CUDEV_IMPLEMENT_VEC_UNARY_FUNC(abs, ::fabsf, float, float)
--- a/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/vec_traits.hpp
@@ -70,7 +70,7 @@ CV_CUDEV_MAKE_VEC_INST(double)

 #undef CV_CUDEV_MAKE_VEC_INST

-template<> struct MakeVec<schar, 1> { typedef char  type; };
+template<> struct MakeVec<schar, 1> { typedef schar type; }; // one-channel case now maps schar to schar instead of plain char
 template<> struct MakeVec<schar, 2> { typedef char2 type; };
 template<> struct MakeVec<schar, 3> { typedef char3 type; };
 template<> struct MakeVec<schar, 4> { typedef char4 type; };