gpuarithm module fixes

Vladislav Vinogradov
2013-04-18 10:46:09 +04:00
parent 43d5e2d8b4
commit b4f3d08725
22 changed files with 7297 additions and 6900 deletions

View File

@@ -48,25 +48,17 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, float, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_no_cuda(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_no_cuda(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_no_cuda(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_no_cuda(); }
@@ -308,468 +300,6 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
#endif
}
////////////////////////////////////////////////////////////////////////
// transpose
namespace arithm
{
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
}
void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
{
CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
dst.create( src.cols, src.rows, src.type() );
cudaStream_t stream = StreamAccessor::getStream(s);
if (src.elemSize() == 1)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else if (src.elemSize() == 4)
{
arithm::transpose<int>(src, dst, stream);
}
else // if (src.elemSize() == 8)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
arithm::transpose<double>(src, dst, stream);
}
}
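A minimal usage sketch for the transpose wrapper above (not part of this commit; it assumes this branch's header layout, where opencv2/gpuarithm.hpp pulls in the GpuMat type). Per the CV_Assert, only 1-, 4- and 8-byte element sizes are accepted: 1-byte data goes to NPP, the rest to the arithm::transpose kernels.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    // CV_32FC1 has elemSize() == 4, so this takes the arithm::transpose<int> path.
    cv::Mat h_src(480, 640, CV_32FC1);
    cv::randu(h_src, cv::Scalar::all(0), cv::Scalar::all(1));

    cv::gpu::GpuMat d_src(h_src), d_dst;
    cv::gpu::transpose(d_src, d_dst);   // d_dst becomes 640x480

    cv::Mat h_dst;
    d_dst.download(h_dst);
    return 0;
}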
////////////////////////////////////////////////////////////////////////
// flip
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
template <int DEPTH> struct NppMirrorFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
};
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
{
typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
dst.create(src.size(), src.type());
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
}
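A sketch of the flipCode convention used by the NPP dispatch above (not part of this commit): 0 mirrors around the x-axis, a positive code around the y-axis, a negative code around both.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    cv::Mat h_src = cv::Mat::eye(4, 4, CV_8UC1);
    cv::gpu::GpuMat d_src(h_src), d_dst;

    cv::gpu::flip(d_src, d_dst, 0);    // NPP_HORIZONTAL_AXIS: flip around the x-axis
    cv::gpu::flip(d_src, d_dst, 1);    // NPP_VERTICAL_AXIS: flip around the y-axis
    cv::gpu::flip(d_src, d_dst, -1);   // NPP_BOTH_AXIS: flip around both axes
    return 0;
}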
////////////////////////////////////////////////////////////////////////
// LUT
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.channels() == 1 || lut.channels() == cn );
CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
Mat nppLut;
lut.convertTo(nppLut, CV_32S);
int nValues3[] = {256, 256, 256};
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
const Npp32s* pLevels3[3];
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
GpuMat d_pLevels;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif
}
else
{
const Npp32s* pValues3[3];
Mat nppLut3[3];
if (nppLut.channels() == 1)
{
#if (CUDA_VERSION <= 4020)
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
#endif
}
else
{
cv::split(nppLut, nppLut3);
#if (CUDA_VERSION <= 4020)
pValues3[0] = nppLut3[0].ptr<Npp32s>();
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
#else
GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
pValues3[0] = d_nppLut0.ptr<Npp32s>();
pValues3[1] = d_nppLut1.ptr<Npp32s>();
pValues3[2] = d_nppLut2.ptr<Npp32s>();
#endif
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
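A usage sketch for the LUT wrapper (not part of this commit). The asserts above require an 8-bit source, a continuous 256-entry CV_8U table, and either a single-channel table or one channel per source channel.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    // 256-entry inversion table: dst(y,x) = lut[src(y,x)].
    cv::Mat lut(1, 256, CV_8UC1);
    for (int i = 0; i < 256; ++i)
        lut.at<uchar>(i) = static_cast<uchar>(255 - i);

    cv::Mat h_src(480, 640, CV_8UC1);
    cv::randu(h_src, cv::Scalar::all(0), cv::Scalar::all(256));

    cv::gpu::GpuMat d_src(h_src), d_dst;
    cv::gpu::LUT(d_src, lut, d_dst);
    return 0;
}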
////////////////////////////////////////////////////////////////////////
// NPP magnitude
namespace
{
typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
{
CV_Assert(src.type() == CV_32FC2);
dst.create(src.size(), CV_32FC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp32fc>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
{
npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
}
void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
{
npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
}
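These single-source overloads take interleaved complex input: npp_magnitude asserts CV_32FC2 and writes a CV_32FC1 result. A sketch (not part of this commit; the default Stream::Null() argument is assumed):

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    // Interleaved (re, im) pairs, as produced e.g. by a complex-output DFT.
    cv::Mat h_complex(256, 256, CV_32FC2);
    cv::randu(h_complex, cv::Scalar::all(-1), cv::Scalar::all(1));

    cv::gpu::GpuMat d_complex(h_complex), d_mag, d_magSqr;
    cv::gpu::magnitude(d_complex, d_mag);        // sqrt(re^2 + im^2)
    cv::gpu::magnitudeSqr(d_complex, d_magSqr);  // re^2 + im^2
    return 0;
}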
////////////////////////////////////////////////////////////////////////
// Polar <-> Cart
namespace cv { namespace gpu { namespace cudev
{
namespace mathfunc
{
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
}
}}}
namespace
{
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F);
if (mag)
mag->create(x.size(), x.type());
if (angle)
angle->create(x.size(), x.type());
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);
GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();
cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
}
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F);
x.create(mag.size(), mag.type());
y.create(mag.size(), mag.type());
GpuMat mag1cn = mag.reshape(1);
GpuMat angle1cn = angle.reshape(1);
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);
polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
}
}
void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
{
polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
}
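A round-trip sketch for the planar overloads above (not part of this commit); x and y must be same-size, same-type CV_32F matrices, per the asserts in cartToPolar_caller.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    cv::Mat h_x(256, 256, CV_32FC1), h_y(256, 256, CV_32FC1);
    cv::randu(h_x, cv::Scalar::all(-1), cv::Scalar::all(1));
    cv::randu(h_y, cv::Scalar::all(-1), cv::Scalar::all(1));

    cv::gpu::GpuMat d_x(h_x), d_y(h_y), d_mag, d_angle, d_x2, d_y2;

    // Forward: (x, y) -> (magnitude, angle in degrees).
    cv::gpu::cartToPolar(d_x, d_y, d_mag, d_angle, true);

    // Inverse: recovers (x, y) up to floating-point error.
    cv::gpu::polarToCart(d_mag, d_angle, d_x2, d_y2, true);
    return 0;
}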
////////////////////////////////////////////////////////////////////////
// normalize
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
{
GpuMat norm_buf;
GpuMat cvt_buf;
normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
}
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
{
double scale = 1, shift = 0;
if (norm_type == NORM_MINMAX)
{
double smin = 0, smax = 0;
double dmin = std::min(a, b), dmax = std::max(a, b);
gpu::minMax(src, &smin, &smax, mask, norm_buf);
scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
shift = dmin - smin * scale;
}
else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
{
scale = gpu::norm(src, norm_type, mask, norm_buf);
scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
shift = 0;
}
else
{
CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
}
if (mask.empty())
{
src.convertTo(dst, dtype, scale, shift);
}
else
{
src.convertTo(cvt_buf, dtype, scale, shift);
cvt_buf.copyTo(dst, mask);
}
}
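A sketch of the NORM_MINMAX path (not part of this commit): a and b give the target range, and the scale/shift computed above map the source min/max onto it before convertTo.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    cv::Mat h_src(480, 640, CV_32FC1);
    cv::randu(h_src, cv::Scalar::all(-100), cv::Scalar::all(100));

    cv::gpu::GpuMat d_src(h_src), d_dst;

    // Map [min(src), max(src)] onto [0, 255] and store as 8-bit; empty mask.
    cv::gpu::normalize(d_src, d_dst, 0, 255, cv::NORM_MINMAX, CV_8U, cv::gpu::GpuMat());
    return 0;
}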
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
}
}}}
namespace
{
template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::imgproc;
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
}
}
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
#else
typedef Npp32s Npp32s_a;
#endif
void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
cudaStream_t stream = StreamAccessor::getStream(s);
if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
{
NppiSize srcsz;
srcsz.width = src.cols;
srcsz.height = src.rows;
NppiSize dstsz;
dstsz.width = dst.cols;
dstsz.height = dst.rows;
NppStreamHandler h(stream);
switch (src.type())
{
case CV_8UC1:
{
Npp8u nVal = saturate_cast<Npp8u>(value[0]);
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_8UC4:
{
Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32SC1:
{
Npp32s nVal = saturate_cast<Npp32s>(value[0]);
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32FC1:
{
Npp32f val = saturate_cast<Npp32f>(value[0]);
Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
{0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
{ copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>},
{ copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>},
{0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/},
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
};
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, top, left, borderType, value, stream);
}
}
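A sketch for copyMakeBorder (not part of this commit). With CV_8UC1 input and BORDER_CONSTANT this takes the nppiCopyConstBorder_8u_C1R branch above; other depth/border combinations fall through to the copyMakeBorder_caller table.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    cv::Mat h_src(480, 640, CV_8UC1, cv::Scalar(128));
    cv::gpu::GpuMat d_src(h_src), d_dst;

    // 16-pixel black border on every side; d_dst becomes 512x672.
    cv::gpu::copyMakeBorder(d_src, d_dst, 16, 16, 16, 16, cv::BORDER_CONSTANT, cv::Scalar(0));
    return 0;
}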
////////////////////////////////////////////////////////////////////////
// integral

View File

@@ -0,0 +1,488 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::merge(const std::vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::split(const GpuMat& /*src*/, std::vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, int, const Scalar&, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// merge/split
namespace cv { namespace gpu { namespace cudev
{
namespace split_merge
{
void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
}
}}}
namespace
{
void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
{
using namespace ::cv::gpu::cudev::split_merge;
CV_Assert(src);
CV_Assert(n > 0);
int depth = src[0].depth();
Size size = src[0].size();
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
bool single_channel_only = true;
int total_channels = 0;
for (size_t i = 0; i < n; ++i)
{
CV_Assert(src[i].size() == size);
CV_Assert(src[i].depth() == depth);
single_channel_only = single_channel_only && src[i].channels() == 1;
total_channels += src[i].channels();
}
CV_Assert(single_channel_only);
CV_Assert(total_channels <= 4);
if (total_channels == 1)
src[0].copyTo(dst);
else
{
dst.create(size, CV_MAKETYPE(depth, total_channels));
PtrStepSzb src_as_devmem[4];
for(size_t i = 0; i < n; ++i)
src_as_devmem[i] = src[i];
PtrStepSzb dst_as_devmem(dst);
merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
}
}
void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
{
using namespace ::cv::gpu::cudev::split_merge;
CV_Assert(dst);
int depth = src.depth();
int num_channels = src.channels();
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
if (num_channels == 1)
{
src.copyTo(dst[0]);
return;
}
for (int i = 0; i < num_channels; ++i)
dst[i].create(src.size(), depth);
CV_Assert(num_channels <= 4);
PtrStepSzb dst_as_devmem[4];
for (int i = 0; i < num_channels; ++i)
dst_as_devmem[i] = dst[i];
PtrStepSzb src_as_devmem(src);
split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
}
}
void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
::merge(src, n, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}
void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
::split(src, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream)
{
dst.resize(src.channels());
if(src.channels() > 0)
::split(src, &dst[0], StreamAccessor::getStream(stream));
}
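A round-trip sketch for the merge/split wrappers above (not part of this commit). split handles at most four channels; merge requires single-channel inputs of identical size and depth, per the asserts in the anonymous-namespace helpers.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>
#include <vector>

int main()
{
    cv::Mat h_src(480, 640, CV_8UC3);
    cv::randu(h_src, cv::Scalar::all(0), cv::Scalar::all(256));

    cv::gpu::GpuMat d_src(h_src), d_merged;

    std::vector<cv::gpu::GpuMat> d_planes;
    cv::gpu::split(d_src, d_planes);      // three CV_8UC1 planes
    cv::gpu::merge(d_planes, d_merged);   // CV_8UC3 again
    return 0;
}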
////////////////////////////////////////////////////////////////////////
// transpose
namespace arithm
{
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
}
void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
{
CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
dst.create( src.cols, src.rows, src.type() );
cudaStream_t stream = StreamAccessor::getStream(s);
if (src.elemSize() == 1)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else if (src.elemSize() == 4)
{
arithm::transpose<int>(src, dst, stream);
}
else // if (src.elemSize() == 8)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
arithm::transpose<double>(src, dst, stream);
}
}
////////////////////////////////////////////////////////////////////////
// flip
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
template <int DEPTH> struct NppMirrorFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
};
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
{
typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
dst.create(src.size(), src.type());
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// LUT
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.channels() == 1 || lut.channels() == cn );
CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
Mat nppLut;
lut.convertTo(nppLut, CV_32S);
int nValues3[] = {256, 256, 256};
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
const Npp32s* pLevels3[3];
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
GpuMat d_pLevels;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif
}
else
{
const Npp32s* pValues3[3];
Mat nppLut3[3];
if (nppLut.channels() == 1)
{
#if (CUDA_VERSION <= 4020)
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
#endif
}
else
{
cv::split(nppLut, nppLut3);
#if (CUDA_VERSION <= 4020)
pValues3[0] = nppLut3[0].ptr<Npp32s>();
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
#else
GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
pValues3[0] = d_nppLut0.ptr<Npp32s>();
pValues3[1] = d_nppLut1.ptr<Npp32s>();
pValues3[2] = d_nppLut2.ptr<Npp32s>();
#endif
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
}
}}}
namespace
{
template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::imgproc;
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
}
}
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
#else
typedef Npp32s Npp32s_a;
#endif
void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom, int left, int right, int borderType, const Scalar& value, Stream& s)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
cudaStream_t stream = StreamAccessor::getStream(s);
if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
{
NppiSize srcsz;
srcsz.width = src.cols;
srcsz.height = src.rows;
NppiSize dstsz;
dstsz.width = dst.cols;
dstsz.height = dst.rows;
NppStreamHandler h(stream);
switch (src.type())
{
case CV_8UC1:
{
Npp8u nVal = saturate_cast<Npp8u>(value[0]);
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_8UC4:
{
Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32SC1:
{
Npp32s nVal = saturate_cast<Npp32s>(value[0]);
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32FC1:
{
Npp32f val = saturate_cast<Npp32f>(value[0]);
Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
{0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
{ copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>},
{ copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>},
{0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/},
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
};
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, top, left, borderType, value, stream);
}
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -49,39 +49,72 @@ using namespace cv::gpu;
void cv::gpu::add(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::add(const GpuMat&, const Scalar&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::subtract(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::subtract(const GpuMat&, const Scalar&, GpuMat&, const GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::multiply(const GpuMat&, const GpuMat&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
void cv::gpu::multiply(const GpuMat&, const Scalar&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
void cv::gpu::divide(double, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::abs(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::sqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::sqrt(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::compare(const GpuMat&, Scalar, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::bitwise_or(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::bitwise_and(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::bitwise_xor(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::rshift(const GpuMat&, Scalar_<int>, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::lshift(const GpuMat&, Scalar_<int>, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::pow(const GpuMat&, double, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::addWeighted(const GpuMat&, double, const GpuMat&, double, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
double cv::gpu::threshold(const GpuMat&, GpuMat&, double, double, int, Stream&) {throw_no_cuda(); return 0.0;}
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
#else
////////////////////////////////////////////////////////////////////////
@@ -3283,4 +3316,118 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
return thresh;
}
////////////////////////////////////////////////////////////////////////
// NPP magnitude
namespace
{
typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
{
CV_Assert(src.type() == CV_32FC2);
dst.create(src.size(), CV_32FC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp32fc>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
{
npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
}
void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
{
npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// Polar <-> Cart
namespace cv { namespace gpu { namespace cudev
{
namespace mathfunc
{
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
}
}}}
namespace
{
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F);
if (mag)
mag->create(x.size(), x.type());
if (angle)
angle->create(x.size(), x.type());
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);
GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();
cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
}
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F);
x.create(mag.size(), mag.type());
y.create(mag.size(), mag.type());
GpuMat mag1cn = mag.reshape(1);
GpuMat angle1cn = angle.reshape(1);
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);
polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
}
}
void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
{
polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
}
#endif

View File

@@ -49,7 +49,6 @@
#include "opencv2/gpuarithm.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/core_c.h"
#include "opencv2/core/gpu_private.hpp"

View File

@@ -47,30 +47,42 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_no_cuda(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
double cv::gpu::norm(const GpuMat&, int) { throw_no_cuda(); return 0.0; }
double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_no_cuda(); return 0.0; }
double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_no_cuda(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_no_cuda(); return 0.0; }
Scalar cv::gpu::sum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_no_cuda(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_no_cuda(); return 0; }
void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_no_cuda(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else
namespace
@@ -109,46 +121,6 @@ namespace
};
}
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
GpuMat buf;
meanStdDev(src, mean, stddev, buf);
}
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
{
CV_Assert(src.type() == CV_8UC1);
if (!deviceSupports(FEATURE_SET_COMPUTE_13))
CV_Error(cv::Error::StsNotImplemented, "Insufficient compute capability");
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
DeviceBuffer dbuf(2);
int bufSize;
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
#else
nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
}
////////////////////////////////////////////////////////////////////////
// norm
@@ -697,6 +669,45 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
}
}
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
GpuMat buf;
meanStdDev(src, mean, stddev, buf);
}
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
{
CV_Assert(src.type() == CV_8UC1);
if (!deviceSupports(FEATURE_SET_COMPUTE_13))
CV_Error(cv::Error::StsNotImplemented, "Insufficient compute capability");
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
DeviceBuffer dbuf(2);
int bufSize;
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
#else
nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
}
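A sketch for meanStdDev (not part of this commit). Only CV_8UC1 input is accepted and FEATURE_SET_COMPUTE_13 is required; the buffered overload above lets callers reuse the NPP scratch buffer across calls.

#include <opencv2/core.hpp>
#include <opencv2/gpuarithm.hpp>

int main()
{
    cv::Mat h_src(480, 640, CV_8UC1);
    cv::randu(h_src, cv::Scalar::all(0), cv::Scalar::all(256));

    cv::gpu::GpuMat d_src(h_src);
    cv::Scalar mean, stddev;
    cv::gpu::meanStdDev(d_src, mean, stddev);   // mean[0], stddev[0] hold the results
    return 0;
}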
//////////////////////////////////////////////////////////////////////////////
// rectStdDev
@@ -727,4 +738,47 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// normalize
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
{
GpuMat norm_buf;
GpuMat cvt_buf;
normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
}
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
{
double scale = 1, shift = 0;
if (norm_type == NORM_MINMAX)
{
double smin = 0, smax = 0;
double dmin = std::min(a, b), dmax = std::max(a, b);
gpu::minMax(src, &smin, &smax, mask, norm_buf);
scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
shift = dmin - smin * scale;
}
else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
{
scale = gpu::norm(src, norm_type, mask, norm_buf);
scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
shift = 0;
}
else
{
CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
}
if (mask.empty())
{
src.convertTo(dst, dtype, scale, shift);
}
else
{
src.convertTo(cvt_buf, dtype, scale, shift);
cvt_buf.copyTo(dst, mask);
}
}
#endif

View File

@@ -1,171 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::merge(const std::vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
void cv::gpu::split(const GpuMat& /*src*/, std::vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace cudev
{
namespace split_merge
{
void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
}
}}}
namespace
{
void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
{
using namespace ::cv::gpu::cudev::split_merge;
CV_Assert(src);
CV_Assert(n > 0);
int depth = src[0].depth();
Size size = src[0].size();
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
bool single_channel_only = true;
int total_channels = 0;
for (size_t i = 0; i < n; ++i)
{
CV_Assert(src[i].size() == size);
CV_Assert(src[i].depth() == depth);
single_channel_only = single_channel_only && src[i].channels() == 1;
total_channels += src[i].channels();
}
CV_Assert(single_channel_only);
CV_Assert(total_channels <= 4);
if (total_channels == 1)
src[0].copyTo(dst);
else
{
dst.create(size, CV_MAKETYPE(depth, total_channels));
PtrStepSzb src_as_devmem[4];
for(size_t i = 0; i < n; ++i)
src_as_devmem[i] = src[i];
PtrStepSzb dst_as_devmem(dst);
merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
}
}
void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
{
using namespace ::cv::gpu::cudev::split_merge;
CV_Assert(dst);
int depth = src.depth();
int num_channels = src.channels();
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
if (num_channels == 1)
{
src.copyTo(dst[0]);
return;
}
for (int i = 0; i < num_channels; ++i)
dst[i].create(src.size(), depth);
CV_Assert(num_channels <= 4);
PtrStepSzb dst_as_devmem[4];
for (int i = 0; i < num_channels; ++i)
dst_as_devmem[i] = dst[i];
PtrStepSzb src_as_devmem(src);
split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
}
}
void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
::merge(src, n, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}
void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
::split(src, dst, StreamAccessor::getStream(stream));
}
void cv::gpu::split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream)
{
dst.resize(src.channels());
if(src.channels() > 0)
::split(src, &dst[0], StreamAccessor::getStream(stream));
}
#endif /* !defined (HAVE_CUDA) */