moved GpuMat and DevMem2D to core module, some code refactoring

2011-11-09 13:13:52 +00:00
parent 8a148e39f0
commit fcfa72081e
95 changed files with 18889 additions and 18485 deletions
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -42,465 +42,467 @@

 #include "internal_shared.hpp"

-namespace cv { namespace gpu { namespace split_merge {
+BEGIN_OPENCV_DEVICE_NAMESPACE

-    template <typename T, size_t elem_size = sizeof(T)>
-    struct TypeTraits 
+namespace split_merge {
+
+template <typename T, size_t elem_size = sizeof(T)>
+struct TypeTraits 
+{
+    typedef T type;
+    typedef T type2;
+    typedef T type3;
+    typedef T type4;
+};
+
+template <typename T>
+struct TypeTraits<T, 1>
+{
+    typedef char type;
+    typedef char2 type2;
+    typedef char3 type3;
+    typedef char4 type4;
+};
+
+template <typename T>
+struct TypeTraits<T, 2>
+{
+    typedef short type;
+    typedef short2 type2;
+    typedef short3 type3;
+    typedef short4 type4;
+};
+
+template <typename T>
+struct TypeTraits<T, 4> 
+{
+    typedef int type;
+    typedef int2 type2;
+    typedef int3 type3;
+    typedef int4 type4;
+};
+
+template <typename T>
+struct TypeTraits<T, 8> 
+{
+    typedef double type;
+    typedef double2 type2;
+    //typedef double3 type3;
+    //typedef double4 type3;
+};
+
+typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
+typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
+
+//------------------------------------------------------------
+// Merge    
+
+template <typename T>
+__global__ void mergeC2_(const uchar* src0, size_t src0_step, 
+                         const uchar* src1, size_t src1_step, 
+                         int rows, int cols, uchar* dst, size_t dst_step)
+{
+    typedef typename TypeTraits<T>::type2 dst_type;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const T* src0_y = (const T*)(src0 + y * src0_step);
+    const T* src1_y = (const T*)(src1 + y * src1_step);
+    dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+    if (x < cols && y < rows) 
+    {                        
+        dst_type dst_elem;
+        dst_elem.x = src0_y[x];
+        dst_elem.y = src1_y[x];
+        dst_y[x] = dst_elem;
+    }
+}
+
+
+template <typename T>
+__global__ void mergeC3_(const uchar* src0, size_t src0_step, 
+                         const uchar* src1, size_t src1_step, 
+                         const uchar* src2, size_t src2_step, 
+                         int rows, int cols, uchar* dst, size_t dst_step)
+{
+    typedef typename TypeTraits<T>::type3 dst_type;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const T* src0_y = (const T*)(src0 + y * src0_step);
+    const T* src1_y = (const T*)(src1 + y * src1_step);
+    const T* src2_y = (const T*)(src2 + y * src2_step);
+    dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+    if (x < cols && y < rows) 
+    {                        
+        dst_type dst_elem;
+        dst_elem.x = src0_y[x];
+        dst_elem.y = src1_y[x];
+        dst_elem.z = src2_y[x];
+        dst_y[x] = dst_elem;
+    }
+}
+
+
+template <>
+__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, 
+                         const uchar* src1, size_t src1_step, 
+                         const uchar* src2, size_t src2_step, 
+                         int rows, int cols, uchar* dst, size_t dst_step)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const double* src0_y = (const double*)(src0 + y * src0_step);
+    const double* src1_y = (const double*)(src1 + y * src1_step);
+    const double* src2_y = (const double*)(src2 + y * src2_step);
+    double* dst_y = (double*)(dst + y * dst_step);
+
+    if (x < cols && y < rows) 
+    {                        
+        dst_y[3 * x] = src0_y[x];
+        dst_y[3 * x + 1] = src1_y[x];
+        dst_y[3 * x + 2] = src2_y[x];
+    }
+}
+
+
+template <typename T>
+__global__ void mergeC4_(const uchar* src0, size_t src0_step, 
+                         const uchar* src1, size_t src1_step, 
+                         const uchar* src2, size_t src2_step, 
+                         const uchar* src3, size_t src3_step, 
+                         int rows, int cols, uchar* dst, size_t dst_step)
+{
+    typedef typename TypeTraits<T>::type4 dst_type;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const T* src0_y = (const T*)(src0 + y * src0_step);
+    const T* src1_y = (const T*)(src1 + y * src1_step);
+    const T* src2_y = (const T*)(src2 + y * src2_step);
+    const T* src3_y = (const T*)(src3 + y * src3_step);
+    dst_type* dst_y = (dst_type*)(dst + y * dst_step);
+
+    if (x < cols && y < rows) 
+    {                        
+        dst_type dst_elem;
+        dst_elem.x = src0_y[x];
+        dst_elem.y = src1_y[x];
+        dst_elem.z = src2_y[x];
+        dst_elem.w = src3_y[x];
+        dst_y[x] = dst_elem;
+    }
+}
+
+
+template <>
+__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, 
+                         const uchar* src1, size_t src1_step, 
+                         const uchar* src2, size_t src2_step, 
+                         const uchar* src3, size_t src3_step, 
+                         int rows, int cols, uchar* dst, size_t dst_step)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const double* src0_y = (const double*)(src0 + y * src0_step);
+    const double* src1_y = (const double*)(src1 + y * src1_step);
+    const double* src2_y = (const double*)(src2 + y * src2_step);
+    const double* src3_y = (const double*)(src3 + y * src3_step);
+    double2* dst_y = (double2*)(dst + y * dst_step);
+
+    if (x < cols && y < rows) 
+    {                        
+        dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
+        dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
+    }
+}
+
+
+template <typename T>
+static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+{
+    dim3 blockDim(32, 8);
+    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+    mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            dst.rows, dst.cols, dst.data, dst.step);
+    cudaSafeCall( cudaGetLastError() );
+
+    if (stream == 0)
+        cudaSafeCall(cudaDeviceSynchronize());
+}
+
+
+template <typename T>
+static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+{
+    dim3 blockDim(32, 8);
+    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+    mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            src[2].data, src[2].step,
+            dst.rows, dst.cols, dst.data, dst.step);
+    cudaSafeCall( cudaGetLastError() );
+
+    if (stream == 0)
+        cudaSafeCall(cudaDeviceSynchronize());
+}
+
+
+template <typename T>
+static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
+{
+    dim3 blockDim(32, 8);
+    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+    mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+            src[0].data, src[0].step,
+            src[1].data, src[1].step,
+            src[2].data, src[2].step,
+            src[3].data, src[3].step,
+            dst.rows, dst.cols, dst.data, dst.step);
+    cudaSafeCall( cudaGetLastError() );
+
+    if (stream == 0)
+        cudaSafeCall(cudaDeviceSynchronize());
+}
+
+
+void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
+                             int total_channels, size_t elem_size,
+                             const cudaStream_t& stream)
+{
+    static MergeFunction merge_func_tbl[] =
    {
-        typedef T type;
-        typedef T type2;
-        typedef T type3;
-        typedef T type4;
+        mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
+        mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
+        mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
    };

-    template <typename T>
-    struct TypeTraits<T, 1>
+    size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
+    MergeFunction merge_func = merge_func_tbl[merge_func_id];
+
+    if (merge_func == 0)
+        cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
+
+    merge_func(src, dst, stream);
+}
+
+
+
+//------------------------------------------------------------
+// Split
+
+
+template <typename T>
+__global__ void splitC2_(const uchar* src, size_t src_step, 
+                        int rows, int cols,
+                        uchar* dst0, size_t dst0_step,
+                        uchar* dst1, size_t dst1_step)
+{
+    typedef typename TypeTraits<T>::type2 src_type;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const src_type* src_y = (const src_type*)(src + y * src_step);
+    T* dst0_y = (T*)(dst0 + y * dst0_step);
+    T* dst1_y = (T*)(dst1 + y * dst1_step);
+
+    if (x < cols && y < rows) 
    {
-        typedef char type;
-        typedef char2 type2;
-        typedef char3 type3;
-        typedef char4 type4;
+        src_type src_elem = src_y[x];
+        dst0_y[x] = src_elem.x;
+        dst1_y[x] = src_elem.y;
+    }
+}
+
+
+template <typename T>
+__global__ void splitC3_(const uchar* src, size_t src_step, 
+                        int rows, int cols,
+                        uchar* dst0, size_t dst0_step,
+                        uchar* dst1, size_t dst1_step,
+                        uchar* dst2, size_t dst2_step)
+{
+    typedef typename TypeTraits<T>::type3 src_type;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const src_type* src_y = (const src_type*)(src + y * src_step);
+    T* dst0_y = (T*)(dst0 + y * dst0_step);
+    T* dst1_y = (T*)(dst1 + y * dst1_step);
+    T* dst2_y = (T*)(dst2 + y * dst2_step);
+
+    if (x < cols && y < rows) 
+    {
+        src_type src_elem = src_y[x];
+        dst0_y[x] = src_elem.x;
+        dst1_y[x] = src_elem.y;
+        dst2_y[x] = src_elem.z;
+    }
+}
+
+
+template <>
+__global__ void splitC3_<double>(
+        const uchar* src, size_t src_step, int rows, int cols,
+        uchar* dst0, size_t dst0_step,
+        uchar* dst1, size_t dst1_step,
+        uchar* dst2, size_t dst2_step)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const double* src_y = (const double*)(src + y * src_step);
+    double* dst0_y = (double*)(dst0 + y * dst0_step);
+    double* dst1_y = (double*)(dst1 + y * dst1_step);
+    double* dst2_y = (double*)(dst2 + y * dst2_step);
+
+    if (x < cols && y < rows) 
+    {
+        dst0_y[x] = src_y[3 * x];
+        dst1_y[x] = src_y[3 * x + 1];
+        dst2_y[x] = src_y[3 * x + 2];
+    }
+}
+
+
+template <typename T>
+__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
+                        uchar* dst0, size_t dst0_step,
+                        uchar* dst1, size_t dst1_step,
+                        uchar* dst2, size_t dst2_step,
+                        uchar* dst3, size_t dst3_step)
+{
+    typedef typename TypeTraits<T>::type4 src_type;
+
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const src_type* src_y = (const src_type*)(src + y * src_step);
+    T* dst0_y = (T*)(dst0 + y * dst0_step);
+    T* dst1_y = (T*)(dst1 + y * dst1_step);
+    T* dst2_y = (T*)(dst2 + y * dst2_step);
+    T* dst3_y = (T*)(dst3 + y * dst3_step);
+
+    if (x < cols && y < rows) 
+    {
+        src_type src_elem = src_y[x];
+        dst0_y[x] = src_elem.x;
+        dst1_y[x] = src_elem.y;
+        dst2_y[x] = src_elem.z;
+        dst3_y[x] = src_elem.w;
+    }
+}
+
+
+template <>
+__global__ void splitC4_<double>(
+        const uchar* src, size_t src_step, int rows, int cols,
+        uchar* dst0, size_t dst0_step,
+        uchar* dst1, size_t dst1_step,
+        uchar* dst2, size_t dst2_step,
+        uchar* dst3, size_t dst3_step)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    const double2* src_y = (const double2*)(src + y * src_step);
+    double* dst0_y = (double*)(dst0 + y * dst0_step);
+    double* dst1_y = (double*)(dst1 + y * dst1_step);
+    double* dst2_y = (double*)(dst2 + y * dst2_step);
+    double* dst3_y = (double*)(dst3 + y * dst3_step);
+
+    if (x < cols && y < rows) 
+    {
+        double2 src_elem1 = src_y[2 * x];
+        double2 src_elem2 = src_y[2 * x + 1];
+        dst0_y[x] = src_elem1.x;
+        dst1_y[x] = src_elem1.y;
+        dst2_y[x] = src_elem2.x;
+        dst3_y[x] = src_elem2.y;
+    }
+}
+
+template <typename T>
+static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+{
+    dim3 blockDim(32, 8);
+    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+    splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step);
+    cudaSafeCall( cudaGetLastError() );
+
+    if (stream == 0)
+        cudaSafeCall(cudaDeviceSynchronize());
+}
+
+
+template <typename T>
+static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+{
+    dim3 blockDim(32, 8);
+    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+    splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+            src.data, src.step, src.rows, src.cols,
+            dst[0].data, dst[0].step,
+            dst[1].data, dst[1].step,
+            dst[2].data, dst[2].step);
+    cudaSafeCall( cudaGetLastError() );
+
+    if (stream == 0)
+        cudaSafeCall(cudaDeviceSynchronize());
+}
+
+
+template <typename T>
+static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
+{
+    dim3 blockDim(32, 8);
+    dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+    splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+             src.data, src.step, src.rows, src.cols,
+             dst[0].data, dst[0].step,
+             dst[1].data, dst[1].step,
+             dst[2].data, dst[2].step,
+             dst[3].data, dst[3].step);
+    cudaSafeCall( cudaGetLastError() );
+
+    if (stream == 0)
+        cudaSafeCall(cudaDeviceSynchronize());
+}
+
+
+void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
+{
+    static SplitFunction split_func_tbl[] =
+    {
+        splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
+        splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
+        splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
    };

-    template <typename T>
-    struct TypeTraits<T, 2>
-    {
-        typedef short type;
-        typedef short2 type2;
-        typedef short3 type3;
-        typedef short4 type4;
-    };
+    size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
+    SplitFunction split_func = split_func_tbl[split_func_id];

-    template <typename T>
-    struct TypeTraits<T, 4> 
-    {
-        typedef int type;
-        typedef int2 type2;
-        typedef int3 type3;
-        typedef int4 type4;
-    };
+    if (split_func == 0)
+        cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);

-    template <typename T>
-    struct TypeTraits<T, 8> 
-    {
-        typedef double type;
-        typedef double2 type2;
-        //typedef double3 type3;
-        //typedef double4 type3;
-    };
+    split_func(src, dst, stream);
+}

-    typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
-    typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
+} // namespace split_merge

-    //------------------------------------------------------------
-    // Merge    
-
-    template <typename T>
-    __global__ void mergeC2_(const uchar* src0, size_t src0_step, 
-                             const uchar* src1, size_t src1_step, 
-                             int rows, int cols, uchar* dst, size_t dst_step)
-    {
-        typedef typename TypeTraits<T>::type2 dst_type;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const T* src0_y = (const T*)(src0 + y * src0_step);
-        const T* src1_y = (const T*)(src1 + y * src1_step);
-        dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-        if (x < cols && y < rows) 
-        {                        
-            dst_type dst_elem;
-            dst_elem.x = src0_y[x];
-            dst_elem.y = src1_y[x];
-            dst_y[x] = dst_elem;
-        }
-    }
-
-
-    template <typename T>
-    __global__ void mergeC3_(const uchar* src0, size_t src0_step, 
-                             const uchar* src1, size_t src1_step, 
-                             const uchar* src2, size_t src2_step, 
-                             int rows, int cols, uchar* dst, size_t dst_step)
-    {
-        typedef typename TypeTraits<T>::type3 dst_type;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const T* src0_y = (const T*)(src0 + y * src0_step);
-        const T* src1_y = (const T*)(src1 + y * src1_step);
-        const T* src2_y = (const T*)(src2 + y * src2_step);
-        dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-        if (x < cols && y < rows) 
-        {                        
-            dst_type dst_elem;
-            dst_elem.x = src0_y[x];
-            dst_elem.y = src1_y[x];
-            dst_elem.z = src2_y[x];
-            dst_y[x] = dst_elem;
-        }
-    }
-
-
-    template <>
-    __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step, 
-                             const uchar* src1, size_t src1_step, 
-                             const uchar* src2, size_t src2_step, 
-                             int rows, int cols, uchar* dst, size_t dst_step)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const double* src0_y = (const double*)(src0 + y * src0_step);
-        const double* src1_y = (const double*)(src1 + y * src1_step);
-        const double* src2_y = (const double*)(src2 + y * src2_step);
-        double* dst_y = (double*)(dst + y * dst_step);
-
-        if (x < cols && y < rows) 
-        {                        
-            dst_y[3 * x] = src0_y[x];
-            dst_y[3 * x + 1] = src1_y[x];
-            dst_y[3 * x + 2] = src2_y[x];
-        }
-    }
-
-
-    template <typename T>
-    __global__ void mergeC4_(const uchar* src0, size_t src0_step, 
-                             const uchar* src1, size_t src1_step, 
-                             const uchar* src2, size_t src2_step, 
-                             const uchar* src3, size_t src3_step, 
-                             int rows, int cols, uchar* dst, size_t dst_step)
-    {
-        typedef typename TypeTraits<T>::type4 dst_type;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const T* src0_y = (const T*)(src0 + y * src0_step);
-        const T* src1_y = (const T*)(src1 + y * src1_step);
-        const T* src2_y = (const T*)(src2 + y * src2_step);
-        const T* src3_y = (const T*)(src3 + y * src3_step);
-        dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-        if (x < cols && y < rows) 
-        {                        
-            dst_type dst_elem;
-            dst_elem.x = src0_y[x];
-            dst_elem.y = src1_y[x];
-            dst_elem.z = src2_y[x];
-            dst_elem.w = src3_y[x];
-            dst_y[x] = dst_elem;
-        }
-    }
-
-
-    template <>
-    __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step, 
-                             const uchar* src1, size_t src1_step, 
-                             const uchar* src2, size_t src2_step, 
-                             const uchar* src3, size_t src3_step, 
-                             int rows, int cols, uchar* dst, size_t dst_step)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const double* src0_y = (const double*)(src0 + y * src0_step);
-        const double* src1_y = (const double*)(src1 + y * src1_step);
-        const double* src2_y = (const double*)(src2 + y * src2_step);
-        const double* src3_y = (const double*)(src3 + y * src3_step);
-        double2* dst_y = (double2*)(dst + y * dst_step);
-
-        if (x < cols && y < rows) 
-        {                        
-            dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
-            dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
-        }
-    }
-
-
-    template <typename T>
-    static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
-    {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
-                src[0].data, src[0].step,
-                src[1].data, src[1].step,
-                dst.rows, dst.cols, dst.data, dst.step);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    template <typename T>
-    static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
-    {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
-                src[0].data, src[0].step,
-                src[1].data, src[1].step,
-                src[2].data, src[2].step,
-                dst.rows, dst.cols, dst.data, dst.step);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    template <typename T>
-    static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
-    {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
-        mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
-                src[0].data, src[0].step,
-                src[1].data, src[1].step,
-                src[2].data, src[2].step,
-                src[3].data, src[3].step,
-                dst.rows, dst.cols, dst.data, dst.step);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    extern "C" void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
-                                 int total_channels, size_t elem_size,
-                                 const cudaStream_t& stream)
-    {
-        static MergeFunction merge_func_tbl[] =
-        {
-            mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
-            mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
-            mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
-        };
-
-        size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
-        MergeFunction merge_func = merge_func_tbl[merge_func_id];
-
-        if (merge_func == 0)
-            cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
-
-        merge_func(src, dst, stream);
-    }
-
-
-
-    //------------------------------------------------------------
-    // Split
-
-
-    template <typename T>
-    __global__ void splitC2_(const uchar* src, size_t src_step, 
-                            int rows, int cols,
-                            uchar* dst0, size_t dst0_step,
-                            uchar* dst1, size_t dst1_step)
-    {
-        typedef typename TypeTraits<T>::type2 src_type;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const src_type* src_y = (const src_type*)(src + y * src_step);
-        T* dst0_y = (T*)(dst0 + y * dst0_step);
-        T* dst1_y = (T*)(dst1 + y * dst1_step);
-
-        if (x < cols && y < rows) 
-        {
-            src_type src_elem = src_y[x];
-            dst0_y[x] = src_elem.x;
-            dst1_y[x] = src_elem.y;
-        }
-    }
-
-
-    template <typename T>
-    __global__ void splitC3_(const uchar* src, size_t src_step, 
-                            int rows, int cols,
-                            uchar* dst0, size_t dst0_step,
-                            uchar* dst1, size_t dst1_step,
-                            uchar* dst2, size_t dst2_step)
-    {
-        typedef typename TypeTraits<T>::type3 src_type;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const src_type* src_y = (const src_type*)(src + y * src_step);
-        T* dst0_y = (T*)(dst0 + y * dst0_step);
-        T* dst1_y = (T*)(dst1 + y * dst1_step);
-        T* dst2_y = (T*)(dst2 + y * dst2_step);
-
-        if (x < cols && y < rows) 
-        {
-            src_type src_elem = src_y[x];
-            dst0_y[x] = src_elem.x;
-            dst1_y[x] = src_elem.y;
-            dst2_y[x] = src_elem.z;
-        }
-    }
-
-
-    template <>
-    __global__ void splitC3_<double>(
-            const uchar* src, size_t src_step, int rows, int cols,
-            uchar* dst0, size_t dst0_step,
-            uchar* dst1, size_t dst1_step,
-            uchar* dst2, size_t dst2_step)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const double* src_y = (const double*)(src + y * src_step);
-        double* dst0_y = (double*)(dst0 + y * dst0_step);
-        double* dst1_y = (double*)(dst1 + y * dst1_step);
-        double* dst2_y = (double*)(dst2 + y * dst2_step);
-
-        if (x < cols && y < rows) 
-        {
-            dst0_y[x] = src_y[3 * x];
-            dst1_y[x] = src_y[3 * x + 1];
-            dst2_y[x] = src_y[3 * x + 2];
-        }
-    }
-
-
-    template <typename T>
-    __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
-                            uchar* dst0, size_t dst0_step,
-                            uchar* dst1, size_t dst1_step,
-                            uchar* dst2, size_t dst2_step,
-                            uchar* dst3, size_t dst3_step)
-    {
-        typedef typename TypeTraits<T>::type4 src_type;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const src_type* src_y = (const src_type*)(src + y * src_step);
-        T* dst0_y = (T*)(dst0 + y * dst0_step);
-        T* dst1_y = (T*)(dst1 + y * dst1_step);
-        T* dst2_y = (T*)(dst2 + y * dst2_step);
-        T* dst3_y = (T*)(dst3 + y * dst3_step);
-
-        if (x < cols && y < rows) 
-        {
-            src_type src_elem = src_y[x];
-            dst0_y[x] = src_elem.x;
-            dst1_y[x] = src_elem.y;
-            dst2_y[x] = src_elem.z;
-            dst3_y[x] = src_elem.w;
-        }
-    }
-
-
-    template <>
-    __global__ void splitC4_<double>(
-            const uchar* src, size_t src_step, int rows, int cols,
-            uchar* dst0, size_t dst0_step,
-            uchar* dst1, size_t dst1_step,
-            uchar* dst2, size_t dst2_step,
-            uchar* dst3, size_t dst3_step)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        const double2* src_y = (const double2*)(src + y * src_step);
-        double* dst0_y = (double*)(dst0 + y * dst0_step);
-        double* dst1_y = (double*)(dst1 + y * dst1_step);
-        double* dst2_y = (double*)(dst2 + y * dst2_step);
-        double* dst3_y = (double*)(dst3 + y * dst3_step);
-
-        if (x < cols && y < rows) 
-        {
-            double2 src_elem1 = src_y[2 * x];
-            double2 src_elem2 = src_y[2 * x + 1];
-            dst0_y[x] = src_elem1.x;
-            dst1_y[x] = src_elem1.y;
-            dst2_y[x] = src_elem2.x;
-            dst3_y[x] = src_elem2.y;
-        }
-    }
-
-    template <typename T>
-    static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
-    {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
-                src.data, src.step, src.rows, src.cols,
-                dst[0].data, dst[0].step,
-                dst[1].data, dst[1].step);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    template <typename T>
-    static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
-    {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
-                src.data, src.step, src.rows, src.cols,
-                dst[0].data, dst[0].step,
-                dst[1].data, dst[1].step,
-                dst[2].data, dst[2].step);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    template <typename T>
-    static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
-    {
-        dim3 blockDim(32, 8);
-        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
-        splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
-                 src.data, src.step, src.rows, src.cols,
-                 dst[0].data, dst[0].step,
-                 dst[1].data, dst[1].step,
-                 dst[2].data, dst[2].step,
-                 dst[3].data, dst[3].step);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    extern "C" void split_caller(const DevMem2Db& src, DevMem2Db* dst,
-                                 int num_channels, size_t elem_size1,
-                                 const cudaStream_t& stream)
-    {
-        static SplitFunction split_func_tbl[] =
-        {
-            splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
-            splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
-            splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
-        };
-
-        size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
-        SplitFunction split_func = split_func_tbl[split_func_id];
-
-        if (split_func == 0)
-            cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
-
-        split_func(src, dst, stream);
-    }
-
-}}} // namespace cv::gpu::split_merge
+END_OPENCV_DEVICE_NAMESPACE