renamed gpuarithm -> cudaarithm

This commit is contained in:
Vladislav Vinogradov
2013-07-23 15:24:10 +04:00
parent 6d216d78d9
commit dcd600cc47
101 changed files with 72 additions and 72 deletions


@@ -0,0 +1,29 @@
if(ANDROID OR IOS)
ocv_module_disable(cudaarithm)
endif()
set(the_description "CUDA-accelerated Operations on Matrices")
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-declarations)
ocv_add_module(cudaarithm opencv_core OPTIONAL opencv_cudalegacy)
ocv_module_include_directories()
ocv_glob_module_sources()
set(extra_libs "")
if(HAVE_CUBLAS)
list(APPEND extra_libs ${CUDA_cublas_LIBRARY})
endif()
if(HAVE_CUFFT)
list(APPEND extra_libs ${CUDA_cufft_LIBRARY})
endif()
ocv_create_module(${extra_libs})
ocv_add_precompiled_headers(${the_module})
ocv_add_accuracy_tests(DEPENDS_ON opencv_imgproc)
ocv_add_perf_tests(DEPENDS_ON opencv_imgproc)


@@ -0,0 +1,168 @@
Arithm Operations on Matrices
=============================
.. highlight:: cpp
gpu::gemm
---------
Performs generalized matrix multiplication.
.. ocv:function:: void gpu::gemm(InputArray src1, InputArray src2, double alpha, InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null())
:param src1: First multiplied input matrix that should have ``CV_32FC1`` , ``CV_64FC1`` , ``CV_32FC2`` , or ``CV_64FC2`` type.
:param src2: Second multiplied input matrix of the same type as ``src1`` .
:param alpha: Weight of the matrix product.
:param src3: Third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2`` .
:param beta: Weight of ``src3`` .
:param dst: Destination matrix. It has the proper size and the same type as input matrices.
:param flags: Operation flags:
* **GEMM_1_T** transpose ``src1``
* **GEMM_2_T** transpose ``src2``
* **GEMM_3_T** transpose ``src3``
:param stream: Stream for the asynchronous version.
The function performs generalized matrix multiplication similar to the ``gemm`` functions in BLAS level 3. For example, ``gemm(src1, src2, alpha, src3, beta, dst, GEMM_1_T + GEMM_3_T)`` corresponds to
.. math::
\texttt{dst} = \texttt{alpha} \cdot \texttt{src1} ^T \cdot \texttt{src2} + \texttt{beta} \cdot \texttt{src3} ^T
.. note:: Transposition operation doesn't support ``CV_64FC2`` input type.
.. seealso:: :ocv:func:`gemm`
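A minimal usage sketch (matrix names and sizes are illustrative; it assumes the module header installs as ``opencv2/cudaarithm.hpp`` and that the functions live in the ``cv::cuda`` namespace declared by that header): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        // dst = 1.0 * A^T * B + 0.5 * C  (GEMM_1_T transposes the first operand)
        cv::Mat A = cv::Mat::ones(64, 32, CV_32FC1);
        cv::Mat B = cv::Mat::ones(64, 16, CV_32FC1);
        cv::Mat C = cv::Mat::ones(32, 16, CV_32FC1);

        cv::cuda::GpuMat d_A(A), d_B(B), d_C(C), d_dst;
        cv::cuda::gemm(d_A, d_B, 1.0, d_C, 0.5, d_dst, cv::GEMM_1_T);

        cv::Mat dst;
        d_dst.download(dst);   // dst is 32 x 16, CV_32FC1
        return 0;
    }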
gpu::mulSpectrums
-----------------
Performs a per-element multiplication of two Fourier spectrums.
.. ocv:function:: void gpu::mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null())
:param src1: First spectrum.
:param src2: Second spectrum with the same size and type as ``src1`` .
:param dst: Destination spectrum.
:param flags: Mock parameter used for CPU/GPU interfaces similarity.
:param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
:param stream: Stream for the asynchronous version.
Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
.. seealso:: :ocv:func:`mulSpectrums`
gpu::mulAndScaleSpectrums
-------------------------
Performs a per-element multiplication of two Fourier spectrums and scales the result.
.. ocv:function:: void gpu::mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null())
:param src1: First spectrum.
:param src2: Second spectrum with the same size and type as ``src1`` .
:param dst: Destination spectrum.
:param flags: Mock parameter used for CPU/GPU interfaces similarity.
:param scale: Scale constant.
:param conjB: Optional flag to specify if the second spectrum needs to be conjugated before the multiplication.
Only full (not packed) ``CV_32FC2`` complex spectrums in the interleaved format are supported for now.
.. seealso:: :ocv:func:`mulSpectrums`
gpu::dft
--------
Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
.. ocv:function:: void gpu::dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null())
:param src: Source matrix (real or complex).
:param dst: Destination matrix (real or complex).
:param dft_size: Size of a discrete Fourier transform.
:param flags: Optional flags:
* **DFT_ROWS** transforms each individual row of the source matrix.
* **DFT_SCALE** scales the result: divide it by the number of elements in the transform (obtained from ``dft_size`` ).
* **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real cases are always forward and inverse, respectively).
* **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real.
Use to handle real matrices ( ``CV_32FC1`` ) and complex matrices in the interleaved format ( ``CV_32FC2`` ).
The source matrix should be continuous, otherwise reallocation and data copying is performed. The function chooses an operation mode depending on the flags, size, and channel count of the source matrix:
* If the source matrix is complex and the output is not specified as real, the destination matrix is complex and has the ``dft_size`` size and ``CV_32FC2`` type. The destination matrix contains a full result of the DFT (forward or inverse).
* If the source matrix is complex and the output is specified as real, the function assumes that its input is the result of the forward transform (see the next item). The destination matrix has the ``dft_size`` size and ``CV_32FC1`` type. It contains the result of the inverse DFT.
* If the source matrix is real (its type is ``CV_32FC1`` ), forward DFT is performed. The result of the DFT is packed into complex ( ``CV_32FC2`` ) matrix. So, the width of the destination matrix is ``dft_size.width / 2 + 1`` . But if the source is a single column, the height is reduced instead of the width.
.. seealso:: :ocv:func:`dft`
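A short sketch of a forward transform of a real matrix (sizes are illustrative; assumes the ``cv::cuda`` namespace from this module's header): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::Mat img(256, 256, CV_32FC1);
        cv::randu(img, cv::Scalar::all(0), cv::Scalar::all(1));

        cv::cuda::GpuMat d_img(img), d_spectrum;

        // real input -> packed complex output: 256 x (256 / 2 + 1), CV_32FC2
        cv::cuda::dft(d_img, d_spectrum, img.size());
        return 0;
    }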
gpu::Convolution
----------------
.. ocv:class:: gpu::Convolution : public Algorithm
Base class for convolution (or cross-correlation) operator. ::
class CV_EXPORTS Convolution : public Algorithm
{
public:
virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0;
};
gpu::Convolution::convolve
---------------------------
Computes a convolution (or cross-correlation) of two images.
.. ocv:function:: void gpu::Convolution::convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null())
:param image: Source image. Only ``CV_32FC1`` images are supported for now.
:param templ: Template image. The size is not greater than the ``image`` size. The type is the same as ``image`` .
:param result: Result image. If ``image`` is *W x H* and ``templ`` is *w x h*, then ``result`` must be *W-w+1 x H-h+1*.
:param ccorr: Flag to evaluate cross-correlation instead of convolution.
:param stream: Stream for the asynchronous version.
gpu::createConvolution
----------------------
Creates an implementation of :ocv:class:`gpu::Convolution` .
.. ocv:function:: Ptr<Convolution> createConvolution(Size user_block_size = Size())
:param user_block_size: Block size. If you leave the default value ``Size(0,0)``, the block size is estimated automatically (optimized for speed). By varying ``user_block_size`` you can reduce memory requirements at the cost of speed.
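A sketch of creating the operator once and reusing it (image and template contents are illustrative): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::Mat image(512, 512, CV_32FC1), templ(32, 32, CV_32FC1);
        cv::randu(image, cv::Scalar::all(0), cv::Scalar::all(1));
        cv::randu(templ, cv::Scalar::all(0), cv::Scalar::all(1));

        cv::Ptr<cv::cuda::Convolution> conv = cv::cuda::createConvolution();

        cv::cuda::GpuMat d_image(image), d_templ(templ), d_result;
        conv->convolve(d_image, d_templ, d_result);         // convolution
        conv->convolve(d_image, d_templ, d_result, true);   // cross-correlation
        // d_result is (512-32+1) x (512-32+1), CV_32FC1
        return 0;
    }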


@@ -0,0 +1,150 @@
Core Operations on Matrices
===========================
.. highlight:: cpp
gpu::merge
----------
Makes a multi-channel matrix out of several single-channel matrices.
.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Array/vector of source matrices.
:param n: Number of source matrices.
:param dst: Destination matrix.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`merge`
gpu::split
----------
Copies each plane of a multi-channel matrix into an array.
.. ocv:function:: void gpu::split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::split(InputArray src, vector<GpuMat>& dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination array/vector of single-channel matrices.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`split`
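A small device-side round trip (values are illustrative): ::

    #include <vector>
    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::Mat bgr(240, 320, CV_8UC3, cv::Scalar(10, 20, 30));

        cv::cuda::GpuMat d_bgr(bgr);
        std::vector<cv::cuda::GpuMat> d_planes;
        cv::cuda::split(d_bgr, d_planes);     // three CV_8UC1 planes

        cv::cuda::GpuMat d_merged;
        cv::cuda::merge(d_planes, d_merged);  // back to a CV_8UC3 matrix
        return 0;
    }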
gpu::transpose
--------------
Transposes a matrix.
.. ocv:function:: void gpu::transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null())
:param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now.
:param dst: Destination matrix.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`transpose`
gpu::flip
---------
Flips a 2D matrix around vertical, horizontal, or both axes.
.. ocv:function:: void gpu::flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null())
:param src: Source matrix. Supports 1-, 3- and 4-channel images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
:param dst: Destination matrix.
:param flipCode: Flip mode for the source:
* ``0`` Flips around x-axis.
* ``> 0`` Flips around y-axis.
* ``< 0`` Flips around both axes.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`flip`
gpu::LookUpTable
----------------
.. ocv:class:: gpu::LookUpTable : public Algorithm
Base class for transformations using a lookup table. ::
class CV_EXPORTS LookUpTable : public Algorithm
{
public:
virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
};
.. seealso:: :ocv:func:`LUT`
gpu::LookUpTable::transform
---------------------------
Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))`` .
.. ocv:function:: void gpu::LookUpTable::transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix. ``CV_8UC1`` and ``CV_8UC3`` matrices are supported for now.
:param dst: Destination matrix.
:param stream: Stream for the asynchronous version.
gpu::createLookUpTable
----------------------
Creates an implementation of :ocv:class:`gpu::LookUpTable` .
.. ocv:function:: Ptr<LookUpTable> createLookUpTable(InputArray lut)
:param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
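A sketch that builds an intensity-inverting table and applies it (the table contents are illustrative): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::Mat lut(1, 256, CV_8UC1);
        for (int i = 0; i < 256; ++i)
            lut.at<uchar>(i) = static_cast<uchar>(255 - i);   // invert intensities

        cv::Ptr<cv::cuda::LookUpTable> tbl = cv::cuda::createLookUpTable(lut);

        cv::cuda::GpuMat d_src(cv::Mat(64, 64, CV_8UC1, cv::Scalar(100))), d_dst;
        tbl->transform(d_src, d_dst);   // d_dst(I) = 255 - d_src(I)
        return 0;
    }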
gpu::copyMakeBorder
-----------------------
Forms a border around an image.
.. ocv:function:: void gpu::copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType, Scalar value = Scalar(), Stream& stream = Stream::Null())
:param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_32SC1`` , and ``CV_32FC1`` types are supported.
:param dst: Destination image with the same type as ``src``. The size is ``Size(src.cols+left+right, src.rows+top+bottom)`` .
:param top:
:param bottom:
:param left:
:param right: Number of pixels in each direction from the source image rectangle to extrapolate. For example: ``top=1, bottom=1, left=1, right=1`` mean that a 1-pixel-wide border needs to be built.
:param borderType: Border type. See :ocv:func:`borderInterpolate` for details. ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , ``BORDER_CONSTANT`` , ``BORDER_REFLECT`` and ``BORDER_WRAP`` are supported for now.
:param value: Border value.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`copyMakeBorder`
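For instance, padding an image by one pixel on every side with a constant value (a sketch; ``BORDER_CONSTANT`` is the standard OpenCV border flag): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::cuda::GpuMat d_src(cv::Mat(100, 100, CV_8UC1, cv::Scalar(7))), d_dst;
        cv::cuda::copyMakeBorder(d_src, d_dst, 1, 1, 1, 1, cv::BORDER_CONSTANT, cv::Scalar(0));
        // d_dst is 102 x 102, CV_8UC1
        return 0;
    }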


@@ -0,0 +1,11 @@
*******************************************************
cudaarithm. CUDA-accelerated Operations on Matrices
*******************************************************
.. toctree::
:maxdepth: 1
core
element_operations
reductions
arithm


@@ -0,0 +1,543 @@
Per-element Operations
======================
.. highlight:: cpp
gpu::add
--------
Computes a matrix-matrix or matrix-scalar sum.
.. ocv:function:: void gpu::add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar. Matrix should have the same size and type as ``src1`` .
:param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
:param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`add`
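A minimal sketch of a masked, type-promoting sum (values are illustrative): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::cuda::GpuMat d_a(cv::Mat(4, 4, CV_8UC1, cv::Scalar(200)));
        cv::cuda::GpuMat d_b(cv::Mat(4, 4, CV_8UC1, cv::Scalar(100)));
        cv::cuda::GpuMat d_mask(cv::Mat(4, 4, CV_8UC1, cv::Scalar(255)));
        cv::cuda::GpuMat d_sum;

        // request a 16-bit destination so that 200 + 100 does not saturate
        cv::cuda::add(d_a, d_b, d_sum, d_mask, CV_16U);
        return 0;
    }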
gpu::subtract
-------------
Computes a matrix-matrix or matrix-scalar difference.
.. ocv:function:: void gpu::subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar. Matrix should have the same size and type as ``src1`` .
:param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
:param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`subtract`
gpu::multiply
-------------
Computes a matrix-matrix or matrix-scalar per-element product.
.. ocv:function:: void gpu::multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
:param scale: Optional scale factor.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`multiply`
gpu::divide
-----------
Computes a matrix-matrix or matrix-scalar division.
.. ocv:function:: void gpu::divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
:param src1: First source matrix or a scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``src1`` depth.
:param scale: Optional scale factor.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
This function, in contrast to :ocv:func:`divide`, uses a round-down rounding mode.
.. seealso:: :ocv:func:`divide`
gpu::absdiff
------------
Computes per-element absolute difference of two matrices (or of a matrix and scalar).
.. ocv:function:: void gpu::absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and type as the input array(s).
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`absdiff`
gpu::abs
--------
Computes an absolute value of each matrix element.
.. ocv:function:: void gpu::abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`abs`
gpu::sqr
--------
Computes a square value of each matrix element.
.. ocv:function:: void gpu::sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
gpu::sqrt
---------
Computes a square root of each matrix element.
.. ocv:function:: void gpu::sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`sqrt`
gpu::exp
--------
Computes an exponent of each matrix element.
.. ocv:function:: void gpu::exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`exp`
gpu::log
--------
Computes a natural logarithm of absolute value of each matrix element.
.. ocv:function:: void gpu::log(InputArray src, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`log`
gpu::pow
--------
Raises every matrix element to a power.
.. ocv:function:: void gpu::pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param power: Exponent to which every matrix element is raised.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
The function ``pow`` raises every element of the input matrix to ``power`` :
.. math::
\texttt{dst} (I) = \fork{\texttt{src}(I)^{\texttt{power}}}{if \texttt{power} is integer}{|\texttt{src}(I)|^{\texttt{power}}}{otherwise}
.. seealso:: :ocv:func:`pow`
gpu::compare
------------
Compares elements of two matrices (or of a matrix and scalar).
.. ocv:function:: void gpu::compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and type as the input array(s).
:param cmpop: Flag specifying the relation between the elements to be checked:
* **CMP_EQ:** ``a(.) == b(.)``
* **CMP_GT:** ``a(.) > b(.)``
* **CMP_GE:** ``a(.) >= b(.)``
* **CMP_LT:** ``a(.) < b(.)``
* **CMP_LE:** ``a(.) <= b(.)``
* **CMP_NE:** ``a(.) != b(.)``
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`compare`
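For example, building a mask of the elements where one matrix exceeds another (a sketch; values are illustrative): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::cuda::GpuMat d_a(cv::Mat(4, 4, CV_32FC1, cv::Scalar(2.0f)));
        cv::cuda::GpuMat d_b(cv::Mat(4, 4, CV_32FC1, cv::Scalar(1.0f)));
        cv::cuda::GpuMat d_mask;

        cv::cuda::compare(d_a, d_b, d_mask, cv::CMP_GT);   // 255 where a > b, 0 elsewhere
        return 0;
    }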
gpu::bitwise_not
----------------
Performs a per-element bitwise inversion.
.. ocv:function:: void gpu::bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination matrix with the same size and type as ``src`` .
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
gpu::bitwise_or
---------------
Performs a per-element bitwise disjunction of two matrices (or of matrix and scalar).
.. ocv:function:: void gpu::bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and type as the input array(s).
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
gpu::bitwise_and
----------------
Performs a per-element bitwise conjunction of two matrices (or of matrix and scalar).
.. ocv:function:: void gpu::bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and type as the input array(s).
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
gpu::bitwise_xor
----------------
Performs a per-element bitwise ``exclusive or`` operation of two matrices (or of matrix and scalar).
.. ocv:function:: void gpu::bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and type as the input array(s).
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
gpu::rshift
-----------
Performs pixel by pixel right shift of an image by a constant value.
.. ocv:function:: void gpu::rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports 1-, 3- and 4-channel images with integer elements.
:param val: Constant values, one per channel.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
gpu::lshift
-----------
Performs pixel by pixel left shift of an image by a constant value.
.. ocv:function:: void gpu::lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports 1-, 3- and 4-channel images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
:param val: Constant values, one per channel.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
gpu::min
--------
Computes the per-element minimum of two matrices (or a matrix and a scalar).
.. ocv:function:: void gpu::min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and type as the input array(s).
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`min`
gpu::max
--------
Computes the per-element maximum of two matrices (or a matrix and a scalar).
.. ocv:function:: void gpu::max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
:param src1: First source matrix or scalar.
:param src2: Second source matrix or scalar.
:param dst: Destination matrix that has the same size and type as the input array(s).
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`max`
gpu::addWeighted
----------------
Computes the weighted sum of two arrays.
.. ocv:function:: void gpu::addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
:param src1: First source array.
:param alpha: Weight for the first array elements.
:param src2: Second source array of the same size and channel number as ``src1`` .
:param beta: Weight for the second array elements.
:param dst: Destination array that has the same size and number of channels as the input arrays.
:param gamma: Scalar added to each sum.
:param dtype: Optional depth of the destination array. When both input arrays have the same depth, ``dtype`` can be set to ``-1``, which will be equivalent to ``src1.depth()``.
:param stream: Stream for the asynchronous version.
The function ``addWeighted`` calculates the weighted sum of two arrays as follows:
.. math::
\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} + \texttt{src2} (I)* \texttt{beta} + \texttt{gamma} )
where ``I`` is a multi-dimensional index of array elements. In case of multi-channel arrays, each channel is processed independently.
.. seealso:: :ocv:func:`addWeighted`
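A short blending sketch (weights are illustrative): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::cuda::GpuMat d_a(cv::Mat(4, 4, CV_32FC1, cv::Scalar(1.0f)));
        cv::cuda::GpuMat d_b(cv::Mat(4, 4, CV_32FC1, cv::Scalar(3.0f)));
        cv::cuda::GpuMat d_blend;

        // d_blend(I) = 0.25 * d_a(I) + 0.75 * d_b(I) + 0.0  ->  2.5 everywhere
        cv::cuda::addWeighted(d_a, 0.25, d_b, 0.75, 0.0, d_blend);
        return 0;
    }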
gpu::threshold
--------------
Applies a fixed-level threshold to each array element.
.. ocv:function:: double gpu::threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
:param src: Source array (single-channel).
:param dst: Destination array with the same size and type as ``src`` .
:param thresh: Threshold value.
:param maxval: Maximum value to use with ``THRESH_BINARY`` and ``THRESH_BINARY_INV`` threshold types.
:param type: Threshold type. For details, see :ocv:func:`threshold` . The ``THRESH_OTSU`` threshold type is not supported.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`threshold`
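A minimal sketch (it assumes ``cv::THRESH_BINARY`` is available from the usual OpenCV headers): ::

    #include "opencv2/core.hpp"
    #include "opencv2/imgproc.hpp"      // for cv::THRESH_BINARY
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::cuda::GpuMat d_src(cv::Mat(4, 4, CV_32FC1, cv::Scalar(0.7f))), d_dst;
        cv::cuda::threshold(d_src, d_dst, 0.5, 1.0, cv::THRESH_BINARY);   // d_dst is 1.0 everywhere
        return 0;
    }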
gpu::magnitude
--------------
Computes magnitudes of complex matrix elements.
.. ocv:function:: void gpu::magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())
:param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param magnitude: Destination matrix of float magnitudes ( ``CV_32FC1`` ).
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`magnitude`
gpu::magnitudeSqr
-----------------
Computes squared magnitudes of complex matrix elements.
.. ocv:function:: void gpu::magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null())
:param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param magnitude: Destination matrix of float magnitude squares ( ``CV_32FC1`` ).
:param stream: Stream for the asynchronous version.
gpu::phase
----------
Computes polar angles of complex matrix elements.
.. ocv:function:: void gpu::phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param angle: Destination matrix of angles ( ``CV_32FC1`` ).
:param angleInDegrees: Flag that indicates whether the angles are measured in degrees.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`phase`
gpu::cartToPolar
----------------
Converts Cartesian coordinates into polar.
.. ocv:function:: void gpu::cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null())
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param magnitude: Destination matrix of float magnitudes ( ``CV_32FC1`` ).
:param angle: Destination matrix of angles ( ``CV_32FC1`` ).
:param angleInDegrees: Flag that indicates whether the angles are measured in degrees.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`cartToPolar`
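A round-trip sketch between Cartesian and polar representations (values are illustrative): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::cuda::GpuMat d_x(cv::Mat(1, 4, CV_32FC1, cv::Scalar(3.0f)));
        cv::cuda::GpuMat d_y(cv::Mat(1, 4, CV_32FC1, cv::Scalar(4.0f)));
        cv::cuda::GpuMat d_mag, d_angle, d_x2, d_y2;

        cv::cuda::cartToPolar(d_x, d_y, d_mag, d_angle);    // magnitudes are 5, angles in radians
        cv::cuda::polarToCart(d_mag, d_angle, d_x2, d_y2);  // recovers d_x and d_y up to rounding
        return 0;
    }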
gpu::polarToCart
----------------
Converts polar coordinates into Cartesian.
.. ocv:function:: void gpu::polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null())
:param magnitude: Source matrix containing magnitudes ( ``CV_32FC1`` ).
:param angle: Source matrix containing angles ( ``CV_32FC1`` ).
:param x: Destination matrix of real components ( ``CV_32FC1`` ).
:param y: Destination matrix of imaginary components ( ``CV_32FC1`` ).
:param angleInDegrees: Flag that indicates angles in degrees.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`polarToCart`


@@ -0,0 +1,295 @@
Matrix Reductions
=================
.. highlight:: cpp
gpu::norm
---------
Returns the norm of a matrix (or difference of two matrices).
.. ocv:function:: double gpu::norm(InputArray src1, int normType)
.. ocv:function:: double gpu::norm(InputArray src1, int normType, GpuMat& buf)
.. ocv:function:: double gpu::norm(InputArray src1, int normType, InputArray mask, GpuMat& buf)
.. ocv:function:: double gpu::norm(InputArray src1, InputArray src2, int normType=NORM_L2)
:param src1: Source matrix. Any matrices except 64F are supported.
:param src2: Second source matrix (if any) with the same size and type as ``src1``.
:param normType: Norm type. ``NORM_L1`` , ``NORM_L2`` , and ``NORM_INF`` are supported for now.
:param mask: Optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`norm`
gpu::sum
--------
Returns the sum of matrix elements.
.. ocv:function:: Scalar gpu::sum(InputArray src)
.. ocv:function:: Scalar gpu::sum(InputArray src, GpuMat& buf)
.. ocv:function:: Scalar gpu::sum(InputArray src, InputArray mask, GpuMat& buf)
:param src: Source image of any depth except for ``CV_64F`` .
:param mask: Optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`sum`
gpu::absSum
-----------
Returns the sum of absolute values for matrix elements.
.. ocv:function:: Scalar gpu::absSum(InputArray src)
.. ocv:function:: Scalar gpu::absSum(InputArray src, GpuMat& buf)
.. ocv:function:: Scalar gpu::absSum(InputArray src, InputArray mask, GpuMat& buf)
:param src: Source image of any depth except for ``CV_64F`` .
:param mask: Optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
gpu::sqrSum
-----------
Returns the squared sum of matrix elements.
.. ocv:function:: Scalar gpu::sqrSum(InputArray src)
.. ocv:function:: Scalar gpu::sqrSum(InputArray src, GpuMat& buf)
.. ocv:function:: Scalar gpu::sqrSum(InputArray src, InputArray mask, GpuMat& buf)
:param src: Source image of any depth except for ``CV_64F`` .
:param mask: Optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
gpu::minMax
-----------
Finds global minimum and maximum matrix elements and returns their values.
.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
.. ocv:function:: void gpu::minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf)
:param src: Single-channel source image.
:param minVal: Pointer to the returned minimum value. Use ``NULL`` if not required.
:param maxVal: Pointer to the returned maximum value. Use ``NULL`` if not required.
:param mask: Optional mask to select a sub-matrix.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
.. seealso:: :ocv:func:`minMaxLoc`
gpu::minMaxLoc
--------------
Finds global minimum and maximum matrix elements and returns their values with locations.
.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, InputArray mask=noArray())
.. ocv:function:: void gpu::minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray mask, GpuMat& valbuf, GpuMat& locbuf)
:param src: Single-channel source image.
:param minVal: Pointer to the returned minimum value. Use ``NULL`` if not required.
:param maxVal: Pointer to the returned maximum value. Use ``NULL`` if not required.
:param minLoc: Pointer to the returned minimum location. Use ``NULL`` if not required.
:param maxLoc: Pointer to the returned maximum location. Use ``NULL`` if not required.
:param mask: Optional mask to select a sub-matrix.
:param valbuf: Optional values buffer to avoid extra memory allocations. It is resized automatically.
:param locbuf: Optional locations buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
.. seealso:: :ocv:func:`minMaxLoc`
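A sketch using the buffered overload so that repeated calls reuse the same temporaries (buffer names follow the signature above): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::Mat host(64, 64, CV_8UC1);
        cv::randu(host, cv::Scalar::all(0), cv::Scalar::all(255));

        cv::cuda::GpuMat d_src(host), valbuf, locbuf;
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::cuda::minMaxLoc(d_src, &minVal, &maxVal, &minLoc, &maxLoc, cv::noArray(), valbuf, locbuf);
        return 0;
    }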
gpu::countNonZero
-----------------
Counts non-zero matrix elements.
.. ocv:function:: int gpu::countNonZero(InputArray src)
.. ocv:function:: int gpu::countNonZero(InputArray src, GpuMat& buf)
:param src: Single-channel source image.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
.. seealso:: :ocv:func:`countNonZero`
gpu::reduce
-----------
Reduces a matrix to a vector.
.. ocv:function:: void gpu::reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())
:param mtx: Source 2D matrix.
:param vec: Destination vector. Its size and type are defined by the ``dim`` and ``dtype`` parameters.
:param dim: Dimension index along which the matrix is reduced. 0 means that the matrix is reduced to a single row. 1 means that the matrix is reduced to a single column.
:param reduceOp: Reduction operation that could be one of the following:
* **CV_REDUCE_SUM** The output is the sum of all rows/columns of the matrix.
* **CV_REDUCE_AVG** The output is the mean vector of all rows/columns of the matrix.
* **CV_REDUCE_MAX** The output is the maximum (column/row-wise) of all rows/columns of the matrix.
* **CV_REDUCE_MIN** The output is the minimum (column/row-wise) of all rows/columns of the matrix.
:param dtype: When it is negative, the destination vector will have the same type as the source matrix. Otherwise, its type will be ``CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), mtx.channels())`` .
:param stream: Stream for the asynchronous version.
The function ``reduce`` reduces the matrix to a vector by treating the matrix rows/columns as a set of 1D vectors and performing the specified operation on the vectors until a single row/column is obtained. For example, the function can be used to compute horizontal and vertical projections of a raster image. In case of ``CV_REDUCE_SUM`` and ``CV_REDUCE_AVG`` , the output may have a larger element bit-depth to preserve accuracy. Multi-channel arrays are also supported in these two reduction modes.
.. seealso:: :ocv:func:`reduce`
gpu::meanStdDev
---------------
Computes a mean value and a standard deviation of matrix elements.
.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev)
.. ocv:function:: void gpu::meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)
:param mtx: Source matrix. ``CV_8UC1`` matrices are supported for now.
:param mean: Mean value.
:param stddev: Standard deviation value.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`meanStdDev`
gpu::rectStdDev
---------------
Computes a standard deviation of integral images.
.. ocv:function:: void gpu::rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null())
:param src: Source image. Only the ``CV_32SC1`` type is supported.
:param sqr: Squared source image. Only the ``CV_32FC1`` type is supported.
:param dst: Destination image with the same type and size as ``src`` .
:param rect: Rectangular window.
:param stream: Stream for the asynchronous version.
gpu::normalize
--------------
Normalizes the norm or value range of an array.
.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
.. ocv:function:: void gpu::normalize(InputArray src, OutputArray dst, double alpha, double beta, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
:param src: Input array.
:param dst: Output array of the same size as ``src`` .
:param alpha: Norm value to normalize to or the lower range boundary in case of the range normalization.
:param beta: Upper range boundary in case of the range normalization; it is not used for the norm normalization.
:param norm_type: Normalization type ( ``NORM_MINMAX`` , ``NORM_L2`` , ``NORM_L1`` or ``NORM_INF`` ).
:param dtype: When negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
:param mask: Optional operation mask.
:param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
:param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`normalize`
gpu::integral
-------------
Computes an integral image.
.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
:param src: Source image. Only ``CV_8UC1`` images are supported for now.
:param sum: Integral image containing 32-bit unsigned integer values packed into ``CV_32SC1`` .
:param buffer: Optional buffer to avoid extra memory allocations. It is resized automatically.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`integral`
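A sketch that keeps the optional buffer alive across calls (sizes are illustrative): ::

    #include "opencv2/core.hpp"
    #include "opencv2/cudaarithm.hpp"

    int main()
    {
        cv::cuda::GpuMat d_src(cv::Mat(240, 320, CV_8UC1, cv::Scalar(1)));
        cv::cuda::GpuMat d_sum, buffer;

        cv::cuda::integral(d_src, d_sum, buffer);   // d_sum holds 32-bit sums (CV_32SC1)
        return 0;
    }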
gpu::sqrIntegral
----------------
Computes a squared integral image.
.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null())
:param src: Source image. Only ``CV_8UC1`` images are supported for now.
:param sqsum: Squared integral image containing 64-bit unsigned integer values packed into ``CV_64FC1`` .
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
:param stream: Stream for the asynchronous version.


@@ -0,0 +1,374 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CUDAARITHM_HPP__
#define __OPENCV_CUDAARITHM_HPP__
#ifndef __cplusplus
# error cudaarithm.hpp header must be compiled as C++
#endif
#include "opencv2/core/cuda.hpp"
namespace cv { namespace cuda {
//! adds one matrix to another (dst = src1 + src2)
CV_EXPORTS void add(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());
//! subtracts one matrix from another (dst = src1 - src2)
CV_EXPORTS void subtract(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), int dtype = -1, Stream& stream = Stream::Null());
//! computes element-wise weighted product of the two arrays (dst = scale * src1 * src2)
CV_EXPORTS void multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
//! computes element-wise weighted quotient of the two arrays (dst = scale * (src1 / src2))
CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
//! computes element-wise weighted reciprocal of an array (dst = scale/src2)
static inline void divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
{
divide(src1, src2, dst, 1.0, dtype, stream);
}
//! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
CV_EXPORTS void absdiff(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
//! computes absolute value of each matrix element
CV_EXPORTS void abs(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
//! computes square of each pixel in an image
CV_EXPORTS void sqr(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
//! computes square root of each pixel in an image
CV_EXPORTS void sqrt(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
//! computes exponent of each matrix element
CV_EXPORTS void exp(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
//! computes natural logarithm of absolute value of each matrix element
CV_EXPORTS void log(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
//! computes power of each matrix element:
//! (dst(i,j) = pow( src(i,j) , power), if src.type() is integer
//! (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
CV_EXPORTS void pow(InputArray src, double power, OutputArray dst, Stream& stream = Stream::Null());
//! compares elements of two arrays (dst = src1 <cmpop> src2)
CV_EXPORTS void compare(InputArray src1, InputArray src2, OutputArray dst, int cmpop, Stream& stream = Stream::Null());
//! performs per-element bit-wise inversion
CV_EXPORTS void bitwise_not(InputArray src, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
//! calculates per-element bit-wise disjunction of two arrays
CV_EXPORTS void bitwise_or(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
//! calculates per-element bit-wise conjunction of two arrays
CV_EXPORTS void bitwise_and(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
//! calculates per-element bit-wise "exclusive or" operation
CV_EXPORTS void bitwise_xor(InputArray src1, InputArray src2, OutputArray dst, InputArray mask = noArray(), Stream& stream = Stream::Null());
//! pixel by pixel right shift of an image by a constant value
//! supports 1, 3 and 4 channel images with integer elements
CV_EXPORTS void rshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
//! pixel by pixel left shift of an image by a constant value
//! supports 1, 3 and 4 channel images with CV_8U, CV_16U or CV_32S depth
CV_EXPORTS void lshift(InputArray src, Scalar_<int> val, OutputArray dst, Stream& stream = Stream::Null());
//! computes per-element minimum of two arrays (dst = min(src1, src2))
CV_EXPORTS void min(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
//! computes per-element maximum of two arrays (dst = max(src1, src2))
CV_EXPORTS void max(InputArray src1, InputArray src2, OutputArray dst, Stream& stream = Stream::Null());
//! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
CV_EXPORTS void addWeighted(InputArray src1, double alpha, InputArray src2, double beta, double gamma, OutputArray dst,
int dtype = -1, Stream& stream = Stream::Null());
//! adds scaled array to another one (dst = alpha*src1 + src2)
static inline void scaleAdd(InputArray src1, double alpha, InputArray src2, OutputArray dst, Stream& stream = Stream::Null())
{
addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
}
//! applies fixed threshold to the image
CV_EXPORTS double threshold(InputArray src, OutputArray dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
//! computes magnitude of complex (x(i).re, x(i).im) vector
//! supports only CV_32FC2 type
CV_EXPORTS void magnitude(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());
//! computes squared magnitude of complex (x(i).re, x(i).im) vector
//! supports only CV_32FC2 type
CV_EXPORTS void magnitudeSqr(InputArray xy, OutputArray magnitude, Stream& stream = Stream::Null());
//! computes magnitude of each (x(i), y(i)) vector
//! supports only floating-point source
CV_EXPORTS void magnitude(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());
//! computes squared magnitude of each (x(i), y(i)) vector
//! supports only floating-point source
CV_EXPORTS void magnitudeSqr(InputArray x, InputArray y, OutputArray magnitude, Stream& stream = Stream::Null());
//! computes angle of each (x(i), y(i)) vector
//! supports only floating-point source
CV_EXPORTS void phase(InputArray x, InputArray y, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
//! converts Cartesian coordinates to polar
//! supports only floating-point source
CV_EXPORTS void cartToPolar(InputArray x, InputArray y, OutputArray magnitude, OutputArray angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
//! converts polar coordinates to Cartesian
//! supports only floating-point source
CV_EXPORTS void polarToCart(InputArray magnitude, InputArray angle, OutputArray x, OutputArray y, bool angleInDegrees = false, Stream& stream = Stream::Null());
//! makes multi-channel array out of several single-channel arrays
CV_EXPORTS void merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream = Stream::Null());
CV_EXPORTS void merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream = Stream::Null());
//! copies each plane of a multi-channel array to a dedicated array
CV_EXPORTS void split(InputArray src, GpuMat* dst, Stream& stream = Stream::Null());
CV_EXPORTS void split(InputArray src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
//! transposes the matrix
//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
CV_EXPORTS void transpose(InputArray src1, OutputArray dst, Stream& stream = Stream::Null());
//! reverses the order of the rows, columns or both in a matrix
//! supports 1, 3 and 4 channel images with CV_8U, CV_16U, CV_32S or CV_32F depth
CV_EXPORTS void flip(InputArray src, OutputArray dst, int flipCode, Stream& stream = Stream::Null());
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
//! destination array will have the depth type as lut and the same channels number as source
//! supports CV_8UC1, CV_8UC3 types
class CV_EXPORTS LookUpTable : public Algorithm
{
public:
virtual void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) = 0;
};
CV_EXPORTS Ptr<LookUpTable> createLookUpTable(InputArray lut);
//! copies 2D array to a larger destination array and pads borders with user-specifiable constant
CV_EXPORTS void copyMakeBorder(InputArray src, OutputArray dst, int top, int bottom, int left, int right, int borderType,
Scalar value = Scalar(), Stream& stream = Stream::Null());
//! computes norm of array
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports all matrices except 64F
CV_EXPORTS double norm(InputArray src1, int normType, InputArray mask, GpuMat& buf);
static inline double norm(InputArray src, int normType)
{
GpuMat buf;
return norm(src, normType, GpuMat(), buf);
}
static inline double norm(InputArray src, int normType, GpuMat& buf)
{
return norm(src, normType, GpuMat(), buf);
}
//! computes norm of the difference between two arrays
//! supports NORM_INF, NORM_L1, NORM_L2
//! supports only CV_8UC1 type
CV_EXPORTS double norm(InputArray src1, InputArray src2, GpuMat& buf, int normType=NORM_L2);
static inline double norm(InputArray src1, InputArray src2, int normType=NORM_L2)
{
GpuMat buf;
return norm(src1, src2, buf, normType);
}
//! computes sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sum(InputArray src, InputArray mask, GpuMat& buf);
static inline Scalar sum(InputArray src)
{
GpuMat buf;
return sum(src, GpuMat(), buf);
}
static inline Scalar sum(InputArray src, GpuMat& buf)
{
return sum(src, GpuMat(), buf);
}
//! computes sum of array elements absolute values
//! supports only single channel images
CV_EXPORTS Scalar absSum(InputArray src, InputArray mask, GpuMat& buf);
static inline Scalar absSum(InputArray src)
{
GpuMat buf;
return absSum(src, GpuMat(), buf);
}
static inline Scalar absSum(InputArray src, GpuMat& buf)
{
return absSum(src, GpuMat(), buf);
}
//! computes squared sum of array elements
//! supports only single channel images
CV_EXPORTS Scalar sqrSum(InputArray src, InputArray mask, GpuMat& buf);
static inline Scalar sqrSum(InputArray src)
{
GpuMat buf;
return sqrSum(src, GpuMat(), buf);
}
static inline Scalar sqrSum(InputArray src, GpuMat& buf)
{
return sqrSum(src, GpuMat(), buf);
}
//! finds global minimum and maximum array elements and returns their values
CV_EXPORTS void minMax(InputArray src, double* minVal, double* maxVal, InputArray mask, GpuMat& buf);
static inline void minMax(InputArray src, double* minVal, double* maxVal=0, InputArray mask=noArray())
{
GpuMat buf;
minMax(src, minVal, maxVal, mask, buf);
}
//! finds global minimum and maximum array elements and returns their values with locations
CV_EXPORTS void minMaxLoc(InputArray src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
InputArray mask, GpuMat& valbuf, GpuMat& locbuf);
static inline void minMaxLoc(InputArray src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
InputArray mask=noArray())
{
GpuMat valBuf, locBuf;
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
}
//! counts non-zero array elements
CV_EXPORTS int countNonZero(InputArray src, GpuMat& buf);
static inline int countNonZero(const GpuMat& src)
{
GpuMat buf;
return countNonZero(src, buf);
}
//! reduces a matrix to a vector
CV_EXPORTS void reduce(InputArray mtx, OutputArray vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
//! computes mean value and standard deviation of all or selected array elements
//! supports only CV_8UC1 type
CV_EXPORTS void meanStdDev(InputArray mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
static inline void meanStdDev(InputArray src, Scalar& mean, Scalar& stddev)
{
GpuMat buf;
meanStdDev(src, mean, stddev, buf);
}
//! computes the standard deviation of integral images
//! supports only CV_32SC1 source type and CV_32FC1 sqr type
//! output will have CV_32FC1 type
CV_EXPORTS void rectStdDev(InputArray src, InputArray sqr, OutputArray dst, Rect rect, Stream& stream = Stream::Null());
//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
CV_EXPORTS void normalize(InputArray src, OutputArray dst, double alpha, double beta,
int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf);
static inline void normalize(InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray())
{
GpuMat norm_buf;
GpuMat cvt_buf;
normalize(src, dst, alpha, beta, norm_type, dtype, mask, norm_buf, cvt_buf);
}
//! computes the integral image
//! sum will have CV_32S type, but will contain unsigned int values
//! supports only CV_8UC1 source type
CV_EXPORTS void integral(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null());
static inline void integralBuffered(InputArray src, OutputArray sum, GpuMat& buffer, Stream& stream = Stream::Null())
{
integral(src, sum, buffer, stream);
}
static inline void integral(InputArray src, OutputArray sum, Stream& stream = Stream::Null())
{
GpuMat buffer;
integral(src, sum, buffer, stream);
}
//! computes squared integral image
//! result matrix will have 64F type, but will contain 64U values
//! supports source images of 8UC1 type only
CV_EXPORTS void sqrIntegral(InputArray src, OutputArray sqsum, GpuMat& buf, Stream& stream = Stream::Null());
static inline void sqrIntegral(InputArray src, OutputArray sqsum, Stream& stream = Stream::Null())
{
GpuMat buffer;
sqrIntegral(src, sqsum, buffer, stream);
}
CV_EXPORTS void gemm(InputArray src1, InputArray src2, double alpha,
InputArray src3, double beta, OutputArray dst, int flags = 0, Stream& stream = Stream::Null());
//! performs per-element multiplication of two full (not packed) Fourier spectrums
//! supports 32FC2 matrices only (interleaved format)
CV_EXPORTS void mulSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, bool conjB=false, Stream& stream = Stream::Null());
//! performs per-element multiplication of two full (not packed) Fourier spectrums
//! supports 32FC2 matrices only (interleaved format)
CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArray dst, int flags, float scale, bool conjB=false, Stream& stream = Stream::Null());
//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
//! Param dft_size is the size of DFT transform.
//!
//! If the source matrix is not continuous, then an additional copy will be done,
//! so to avoid copying ensure the source matrix is a continuous one. If you want to use
//! a preallocated output ensure it is continuous too, otherwise it will be reallocated.
//!
//! Being implemented via CUFFT real-to-complex transform result contains only non-redundant values
//! in CUFFT's format. Result as full complex matrix for such kind of transform cannot be retrieved.
//!
//! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format.
CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
//! computes convolution (or cross-correlation) of two images using discrete Fourier transform
//! supports source images of 32FC1 type only
//! result matrix will have 32FC1 type
class CV_EXPORTS Convolution : public Algorithm
{
public:
virtual void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null()) = 0;
};
CV_EXPORTS Ptr<Convolution> createConvolution(Size user_block_size = Size());
}} // namespace cv { namespace cuda {
#endif /* __OPENCV_CUDAARITHM_HPP__ */


@@ -0,0 +1,307 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// GEMM
CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T)
#define ALL_GEMM_FLAGS Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), \
GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T))
DEF_PARAM_TEST(Sz_Type_Flags, cv::Size, MatType, GemmFlags);
PERF_TEST_P(Sz_Type_Flags, GEMM,
Combine(Values(cv::Size(512, 512), cv::Size(1024, 1024)),
Values(CV_32FC1, CV_32FC2, CV_64FC1),
ALL_GEMM_FLAGS))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int flags = GET_PARAM(2);
cv::Mat src1(size, type);
declare.in(src1, WARMUP_RNG);
cv::Mat src2(size, type);
declare.in(src2, WARMUP_RNG);
cv::Mat src3(size, type);
declare.in(src3, WARMUP_RNG);
if (PERF_RUN_GPU())
{
declare.time(5.0);
const cv::cuda::GpuMat d_src1(src1);
const cv::cuda::GpuMat d_src2(src2);
const cv::cuda::GpuMat d_src3(src3);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, dst, flags);
GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
else
{
declare.time(50.0);
cv::Mat dst;
TEST_CYCLE() cv::gemm(src1, src2, 1.0, src3, 1.0, dst, flags);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// MulSpectrums
CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
DEF_PARAM_TEST(Sz_Flags, cv::Size, DftFlags);
PERF_TEST_P(Sz_Flags, MulSpectrums,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(0, DftFlags(cv::DFT_ROWS))))
{
const cv::Size size = GET_PARAM(0);
const int flag = GET_PARAM(1);
cv::Mat a(size, CV_32FC2);
cv::Mat b(size, CV_32FC2);
declare.in(a, b, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_a(a);
const cv::cuda::GpuMat d_b(b);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::mulSpectrums(d_a, d_b, dst, flag);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::mulSpectrums(a, b, dst, flag);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// MulAndScaleSpectrums
PERF_TEST_P(Sz, MulAndScaleSpectrums,
GPU_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
const float scale = 1.f / size.area();
cv::Mat src1(size, CV_32FC2);
cv::Mat src2(size, CV_32FC2);
declare.in(src1,src2, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src1(src1);
const cv::cuda::GpuMat d_src2(src2);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::mulAndScaleSpectrums(d_src1, d_src2, dst, cv::DFT_ROWS, scale, false);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// Dft
PERF_TEST_P(Sz_Flags, Dft,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(0, DftFlags(cv::DFT_ROWS), DftFlags(cv::DFT_INVERSE))))
{
declare.time(10.0);
const cv::Size size = GET_PARAM(0);
const int flag = GET_PARAM(1);
cv::Mat src(size, CV_32FC2);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::dft(d_src, dst, size, flag);
GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::dft(src, dst, flag);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Convolve
DEF_PARAM_TEST(Sz_KernelSz_Ccorr, cv::Size, int, bool);
PERF_TEST_P(Sz_KernelSz_Ccorr, Convolve,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(17, 27, 32, 64),
Bool()))
{
declare.time(10.0);
const cv::Size size = GET_PARAM(0);
const int templ_size = GET_PARAM(1);
const bool ccorr = GET_PARAM(2);
const cv::Mat image(size, CV_32FC1);
const cv::Mat templ(templ_size, templ_size, CV_32FC1);
declare.in(image, templ, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::cuda::GpuMat d_image = cv::cuda::createContinuous(size, CV_32FC1);
d_image.upload(image);
cv::cuda::GpuMat d_templ = cv::cuda::createContinuous(templ_size, templ_size, CV_32FC1);
d_templ.upload(templ);
cv::Ptr<cv::cuda::Convolution> convolution = cv::cuda::createConvolution();
cv::cuda::GpuMat dst;
TEST_CYCLE() convolution->convolve(d_image, d_templ, dst, ccorr);
GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
else
{
if (ccorr)
FAIL_NO_CPU();
cv::Mat dst;
TEST_CYCLE() cv::filter2D(image, dst, image.depth(), templ);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Integral
PERF_TEST_P(Sz, Integral,
GPU_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_buf;
TEST_CYCLE() cv::cuda::integral(d_src, dst, d_buf);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::integral(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// IntegralSqr
PERF_TEST_P(Sz, IntegralSqr,
GPU_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst, buf;
TEST_CYCLE() cv::cuda::sqrIntegral(d_src, dst, buf);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}

View File

@@ -0,0 +1,321 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
#define ARITHM_MAT_DEPTH Values(CV_8U, CV_16U, CV_32F, CV_64F)
//////////////////////////////////////////////////////////////////////
// Merge
PERF_TEST_P(Sz_Depth_Cn, Merge,
Combine(GPU_TYPICAL_MAT_SIZES,
ARITHM_MAT_DEPTH,
Values(2, 3, 4)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
std::vector<cv::Mat> src(channels);
for (int i = 0; i < channels; ++i)
{
src[i].create(size, depth);
declare.in(src[i], WARMUP_RNG);
}
if (PERF_RUN_GPU())
{
std::vector<cv::cuda::GpuMat> d_src(channels);
for (int i = 0; i < channels; ++i)
d_src[i].upload(src[i]);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::merge(d_src, dst);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::merge(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Split
PERF_TEST_P(Sz_Depth_Cn, Split,
Combine(GPU_TYPICAL_MAT_SIZES,
ARITHM_MAT_DEPTH,
Values(2, 3, 4)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
cv::Mat src(size, CV_MAKE_TYPE(depth, channels));
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
std::vector<cv::cuda::GpuMat> dst;
TEST_CYCLE() cv::cuda::split(d_src, dst);
const cv::cuda::GpuMat& dst0 = dst[0];
const cv::cuda::GpuMat& dst1 = dst[1];
GPU_SANITY_CHECK(dst0, 1e-10);
GPU_SANITY_CHECK(dst1, 1e-10);
}
else
{
std::vector<cv::Mat> dst;
TEST_CYCLE() cv::split(src, dst);
const cv::Mat& dst0 = dst[0];
const cv::Mat& dst1 = dst[1];
CPU_SANITY_CHECK(dst0);
CPU_SANITY_CHECK(dst1);
}
}
//////////////////////////////////////////////////////////////////////
// Transpose
PERF_TEST_P(Sz_Type, Transpose,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32SC2, CV_64FC1)))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::transpose(d_src, dst);
GPU_SANITY_CHECK(dst, 1e-10);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::transpose(src, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Flip
enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
DEF_PARAM_TEST(Sz_Depth_Cn_Code, cv::Size, MatDepth, MatCn, FlipCode);
PERF_TEST_P(Sz_Depth_Cn_Code, Flip,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
FlipCode::all()))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int flipCode = GET_PARAM(3);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::flip(d_src, dst, flipCode);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::flip(src, dst, flipCode);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// LutOneChannel
PERF_TEST_P(Sz_Type, LutOneChannel,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC3)))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
cv::Mat lut(1, 256, CV_8UC1);
declare.in(lut, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::Ptr<cv::cuda::LookUpTable> lutAlg = cv::cuda::createLookUpTable(lut);
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() lutAlg->transform(d_src, dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::LUT(src, lut, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// LutMultiChannel
PERF_TEST_P(Sz_Type, LutMultiChannel,
Combine(GPU_TYPICAL_MAT_SIZES,
Values<MatType>(CV_8UC3)))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
cv::Mat lut(1, 256, CV_MAKE_TYPE(CV_8U, src.channels()));
declare.in(lut, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::Ptr<cv::cuda::LookUpTable> lutAlg = cv::cuda::createLookUpTable(lut);
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() lutAlg->transform(d_src, dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::LUT(src, lut, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// CopyMakeBorder
DEF_PARAM_TEST(Sz_Depth_Cn_Border, cv::Size, MatDepth, MatCn, BorderMode);
PERF_TEST_P(Sz_Depth_Cn_Border, CopyMakeBorder,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
ALL_BORDER_MODES))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int borderMode = GET_PARAM(3);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::copyMakeBorder(d_src, dst, 5, 5, 5, 5, borderMode);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderMode);
CPU_SANITY_CHECK(dst);
}
}
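The two LUT benchmarks above go through the LookUpTable algorithm object rather than a free function; a short sketch of the intended usage (the gamma table below is purely illustrative):

#include <cmath>
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

void applyGammaLut(const cv::cuda::GpuMat& d_src, cv::cuda::GpuMat& d_dst)
{
    // Build a 256-entry 8-bit table on the host (here: a simple gamma curve).
    cv::Mat lut(1, 256, CV_8UC1);
    for (int i = 0; i < 256; ++i)
        lut.at<uchar>(i) = cv::saturate_cast<uchar>(255.0 * std::pow(i / 255.0, 0.5));

    // The returned algorithm object holds the table on the device and can be reused
    // for any number of transform() calls.
    cv::Ptr<cv::cuda::LookUpTable> tbl = cv::cuda::createLookUpTable(lut);
    tbl->transform(d_src, d_dst);
}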

File diff suppressed because it is too large

View File

@@ -0,0 +1,47 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace perf;
CV_PERF_TEST_CUDA_MAIN(cudaarithm)

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"

View File

@@ -0,0 +1,65 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/core.hpp"
#include "opencv2/imgproc.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif
#endif

View File

@@ -0,0 +1,467 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// Norm
DEF_PARAM_TEST(Sz_Depth_Norm, cv::Size, MatDepth, NormType);
PERF_TEST_P(Sz_Depth_Norm, Norm,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32S, CV_32F),
Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int normType = GET_PARAM(2);
cv::Mat src(size, depth);
if (depth == CV_8U)
cv::randu(src, 0, 254);
else
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
double gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src, normType, d_buf);
SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
}
else
{
double cpu_dst;
TEST_CYCLE() cpu_dst = cv::norm(src, normType);
SANITY_CHECK(cpu_dst, 1e-6, ERROR_RELATIVE);
}
}
//////////////////////////////////////////////////////////////////////
// NormDiff
DEF_PARAM_TEST(Sz_Norm, cv::Size, NormType);
PERF_TEST_P(Sz_Norm, NormDiff,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
{
const cv::Size size = GET_PARAM(0);
const int normType = GET_PARAM(1);
cv::Mat src1(size, CV_8UC1);
declare.in(src1, WARMUP_RNG);
cv::Mat src2(size, CV_8UC1);
declare.in(src2, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src1(src1);
const cv::cuda::GpuMat d_src2(src2);
cv::cuda::GpuMat d_buf;
double gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::norm(d_src1, d_src2, d_buf, normType);
SANITY_CHECK(gpu_dst);
}
else
{
double cpu_dst;
TEST_CYCLE() cpu_dst = cv::norm(src1, src2, normType);
SANITY_CHECK(cpu_dst);
}
}
//////////////////////////////////////////////////////////////////////
// Sum
PERF_TEST_P(Sz_Depth_Cn, Sum,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::sum(d_src, d_buf);
SANITY_CHECK(gpu_dst, 1e-5, ERROR_RELATIVE);
}
else
{
cv::Scalar cpu_dst;
TEST_CYCLE() cpu_dst = cv::sum(src);
SANITY_CHECK(cpu_dst, 1e-6, ERROR_RELATIVE);
}
}
//////////////////////////////////////////////////////////////////////
// SumAbs
PERF_TEST_P(Sz_Depth_Cn, SumAbs,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::absSum(d_src, d_buf);
SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// SumSqr
PERF_TEST_P(Sz_Depth_Cn, SumSqr,
Combine(GPU_TYPICAL_MAT_SIZES,
Values<MatDepth>(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_dst;
TEST_CYCLE() gpu_dst = cv::cuda::sqrSum(d_src, d_buf);
SANITY_CHECK(gpu_dst, 1e-6, ERROR_RELATIVE);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// MinMax
PERF_TEST_P(Sz_Depth, MinMax,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
if (depth == CV_8U)
cv::randu(src, 0, 254);
else
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
double gpu_minVal, gpu_maxVal;
TEST_CYCLE() cv::cuda::minMax(d_src, &gpu_minVal, &gpu_maxVal, cv::cuda::GpuMat(), d_buf);
SANITY_CHECK(gpu_minVal, 1e-10);
SANITY_CHECK(gpu_maxVal, 1e-10);
}
else
{
double cpu_minVal, cpu_maxVal;
TEST_CYCLE() cv::minMaxLoc(src, &cpu_minVal, &cpu_maxVal);
SANITY_CHECK(cpu_minVal);
SANITY_CHECK(cpu_maxVal);
}
}
//////////////////////////////////////////////////////////////////////
// MinMaxLoc
PERF_TEST_P(Sz_Depth, MinMaxLoc,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
if (depth == CV_8U)
cv::randu(src, 0, 254);
else
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_valbuf, d_locbuf;
double gpu_minVal, gpu_maxVal;
cv::Point gpu_minLoc, gpu_maxLoc;
TEST_CYCLE() cv::cuda::minMaxLoc(d_src, &gpu_minVal, &gpu_maxVal, &gpu_minLoc, &gpu_maxLoc, cv::cuda::GpuMat(), d_valbuf, d_locbuf);
SANITY_CHECK(gpu_minVal, 1e-10);
SANITY_CHECK(gpu_maxVal, 1e-10);
}
else
{
double cpu_minVal, cpu_maxVal;
cv::Point cpu_minLoc, cpu_maxLoc;
TEST_CYCLE() cv::minMaxLoc(src, &cpu_minVal, &cpu_maxVal, &cpu_minLoc, &cpu_maxLoc);
SANITY_CHECK(cpu_minVal);
SANITY_CHECK(cpu_maxVal);
}
}
//////////////////////////////////////////////////////////////////////
// CountNonZero
PERF_TEST_P(Sz_Depth, CountNonZero,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
int gpu_dst = 0;
TEST_CYCLE() gpu_dst = cv::cuda::countNonZero(d_src, d_buf);
SANITY_CHECK(gpu_dst);
}
else
{
int cpu_dst = 0;
TEST_CYCLE() cpu_dst = cv::countNonZero(src);
SANITY_CHECK(cpu_dst);
}
}
//////////////////////////////////////////////////////////////////////
// Reduce
CV_ENUM(ReduceCode, REDUCE_SUM, REDUCE_AVG, REDUCE_MAX, REDUCE_MIN)
enum {Rows = 0, Cols = 1};
CV_ENUM(ReduceDim, Rows, Cols)
DEF_PARAM_TEST(Sz_Depth_Cn_Code_Dim, cv::Size, MatDepth, MatCn, ReduceCode, ReduceDim);
PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Reduce,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_16S, CV_32F),
Values(1, 2, 3, 4),
ReduceCode::all(),
ReduceDim::all()))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int reduceOp = GET_PARAM(3);
const int dim = GET_PARAM(4);
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
TEST_CYCLE() cv::cuda::reduce(d_src, dst, dim, reduceOp);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::reduce(src, dst, dim, reduceOp);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Normalize
DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType);
PERF_TEST_P(Sz_Depth_NormType, Normalize,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
Values(NormType(cv::NORM_INF),
NormType(cv::NORM_L1),
NormType(cv::NORM_L2),
NormType(cv::NORM_MINMAX))))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int norm_type = GET_PARAM(2);
const double alpha = 1;
const double beta = 0;
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat dst;
cv::cuda::GpuMat d_norm_buf, d_cvt_buf;
TEST_CYCLE() cv::cuda::normalize(d_src, dst, alpha, beta, norm_type, type, cv::cuda::GpuMat(), d_norm_buf, d_cvt_buf);
GPU_SANITY_CHECK(dst, 1e-6);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::normalize(src, dst, alpha, beta, norm_type, type);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// MeanStdDev
PERF_TEST_P(Sz, MeanStdDev,
GPU_TYPICAL_MAT_SIZES)
{
const cv::Size size = GetParam();
cv::Mat src(size, CV_8UC1);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::cuda::GpuMat d_src(src);
cv::cuda::GpuMat d_buf;
cv::Scalar gpu_mean;
cv::Scalar gpu_stddev;
TEST_CYCLE() cv::cuda::meanStdDev(d_src, gpu_mean, gpu_stddev, d_buf);
SANITY_CHECK(gpu_mean);
SANITY_CHECK(gpu_stddev);
}
else
{
cv::Scalar cpu_mean;
cv::Scalar cpu_stddev;
TEST_CYCLE() cv::meanStdDev(src, cpu_mean, cpu_stddev);
SANITY_CHECK(cpu_mean);
SANITY_CHECK(cpu_stddev);
}
}
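All of the reductions benchmarked above accept a caller-owned buffer; the point of those overloads is to reuse the buffer across calls instead of allocating device memory every time. A minimal sketch mirroring the buffered signatures used in the tests (the helper and its loop are illustrative):

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

void reduceMany(const std::vector<cv::cuda::GpuMat>& frames)
{
    cv::cuda::GpuMat buf; // grown once, then reused by every call below
    for (size_t i = 0; i < frames.size(); ++i)
    {
        cv::Scalar s = cv::cuda::sum(frames[i], buf);
        double l2 = cv::cuda::norm(frames[i], cv::NORM_L2, buf);
        (void)s; (void)l2; // consume the results in real code
    }
}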

View File

@@ -0,0 +1,636 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::gemm(InputArray, InputArray, double, InputArray, double, OutputArray, int, Stream&) { throw_no_cuda(); }
void cv::cuda::mulSpectrums(InputArray, InputArray, OutputArray, int, bool, Stream&) { throw_no_cuda(); }
void cv::cuda::mulAndScaleSpectrums(InputArray, InputArray, OutputArray, int, float, bool, Stream&) { throw_no_cuda(); }
void cv::cuda::dft(InputArray, OutputArray, Size, int, Stream&) { throw_no_cuda(); }
Ptr<Convolution> cv::cuda::createConvolution(Size) { throw_no_cuda(); return Ptr<Convolution>(); }
#else /* !defined (HAVE_CUDA) */
namespace
{
#define error_entry(entry) { entry, #entry }
struct ErrorEntry
{
int code;
const char* str;
};
struct ErrorEntryComparer
{
int code;
ErrorEntryComparer(int code_) : code(code_) {}
bool operator()(const ErrorEntry& e) const { return e.code == code; }
};
String getErrorString(int code, const ErrorEntry* errors, size_t n)
{
size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
const char* msg = (idx != n) ? errors[idx].str : "Unknown error code";
String str = cv::format("%s [Code = %d]", msg, code);
return str;
}
}
#ifdef HAVE_CUBLAS
namespace
{
const ErrorEntry cublas_errors[] =
{
error_entry( CUBLAS_STATUS_SUCCESS ),
error_entry( CUBLAS_STATUS_NOT_INITIALIZED ),
error_entry( CUBLAS_STATUS_ALLOC_FAILED ),
error_entry( CUBLAS_STATUS_INVALID_VALUE ),
error_entry( CUBLAS_STATUS_ARCH_MISMATCH ),
error_entry( CUBLAS_STATUS_MAPPING_ERROR ),
error_entry( CUBLAS_STATUS_EXECUTION_FAILED ),
error_entry( CUBLAS_STATUS_INTERNAL_ERROR )
};
const size_t cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]);
static inline void ___cublasSafeCall(cublasStatus_t err, const char* file, const int line, const char* func)
{
if (CUBLAS_STATUS_SUCCESS != err)
{
String msg = getErrorString(err, cublas_errors, cublas_error_num);
cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
}
}
}
#if defined(__GNUC__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, "")
#endif
#endif // HAVE_CUBLAS
#ifdef HAVE_CUFFT
namespace
{
//////////////////////////////////////////////////////////////////////////
// CUFFT errors
const ErrorEntry cufft_errors[] =
{
error_entry( CUFFT_INVALID_PLAN ),
error_entry( CUFFT_ALLOC_FAILED ),
error_entry( CUFFT_INVALID_TYPE ),
error_entry( CUFFT_INVALID_VALUE ),
error_entry( CUFFT_INTERNAL_ERROR ),
error_entry( CUFFT_EXEC_FAILED ),
error_entry( CUFFT_SETUP_FAILED ),
error_entry( CUFFT_INVALID_SIZE ),
error_entry( CUFFT_UNALIGNED_DATA )
};
const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
void ___cufftSafeCall(int err, const char* file, const int line, const char* func)
{
if (CUFFT_SUCCESS != err)
{
String msg = getErrorString(err, cufft_errors, cufft_error_num);
cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
}
}
}
#if defined(__GNUC__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, "")
#endif
#endif
////////////////////////////////////////////////////////////////////////
// gemm
void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray _src3, double beta, OutputArray _dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
(void) _src1;
(void) _src2;
(void) alpha;
(void) _src3;
(void) beta;
(void) _dst;
(void) flags;
(void) stream;
CV_Error(Error::StsNotImplemented, "The library was built without CUBLAS");
#else
// CUBLAS works with column-major matrices
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
GpuMat src3 = _src3.getGpuMat();
CV_Assert( src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2 );
CV_Assert( src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()) );
if (src1.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
bool tr1 = (flags & GEMM_1_T) != 0;
bool tr2 = (flags & GEMM_2_T) != 0;
bool tr3 = (flags & GEMM_3_T) != 0;
if (src1.type() == CV_64FC2)
{
if (tr1 || tr2 || tr3)
CV_Error(cv::Error::StsNotImplemented, "the transpose operation is not implemented for the CV_64FC2 type");
}
Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
Size dstSize(src2Size.width, src1Size.height);
CV_Assert( src1Size.width == src2Size.height );
CV_Assert( src3.empty() || src3Size == dstSize );
_dst.create(dstSize, src1.type());
GpuMat dst = _dst.getGpuMat();
if (beta != 0)
{
if (src3.empty())
{
dst.setTo(Scalar::all(0), stream);
}
else
{
if (tr3)
{
cuda::transpose(src3, dst, stream);
}
else
{
src3.copyTo(dst, stream);
}
}
}
cublasHandle_t handle;
cublasSafeCall( cublasCreate_v2(&handle) );
cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );
cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );
const float alphaf = static_cast<float>(alpha);
const float betaf = static_cast<float>(beta);
const cuComplex alphacf = make_cuComplex(alphaf, 0);
const cuComplex betacf = make_cuComplex(betaf, 0);
const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);
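// CUBLAS stores matrices column-major while GpuMat is row-major. A row-major matrix is
// the column-major view of its transpose, so dst = op(src1) * op(src2) is evaluated as
// dst^T = op(src2)^T * op(src1)^T: src2 is passed as the first operand below and the
// transpose flags are swapped accordingly.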
cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;
switch (src1.type())
{
case CV_32FC1:
cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphaf,
src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
&betaf,
dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
break;
case CV_64FC1:
cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alpha,
src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
&beta,
dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
break;
case CV_32FC2:
cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphacf,
src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
&betacf,
dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
break;
case CV_64FC2:
cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphac,
src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
&betac,
dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
break;
}
cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void) flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulSpectrums, device::mulSpectrums_CONJ };
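// callers[0] multiplies the spectrums as-is; callers[1] conjugates the second spectrum first (selected via conjB below).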
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2 );
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
#ifdef HAVE_CUFFT
namespace cv { namespace cuda { namespace device
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream);
}}}
#endif
void cv::cuda::mulAndScaleSpectrums(InputArray _src1, InputArray _src2, OutputArray _dst, int flags, float scale, bool conjB, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src1;
(void) _src2;
(void) _dst;
(void) flags;
(void) scale;
(void) conjB;
(void) stream;
throw_no_cuda();
#else
(void)flags;
typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, PtrStepSz<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { device::mulAndScaleSpectrums, device::mulAndScaleSpectrums_CONJ };
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == src2.type() && src1.type() == CV_32FC2);
CV_Assert( src1.size() == src2.size() );
_dst.create(src1.size(), CV_32FC2);
GpuMat dst = _dst.getGpuMat();
Caller caller = callers[(int)conjB];
caller(src1, src2, scale, dst, StreamAccessor::getStream(stream));
#endif
}
//////////////////////////////////////////////////////////////////////////////
// dft
void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
{
#ifndef HAVE_CUFFT
(void) _src;
(void) _dst;
(void) dft_size;
(void) flags;
(void) stream;
throw_no_cuda();
#else
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
// We don't support unpacked output (in the case of real input)
CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
const bool is_1d_input = (dft_size.height == 1) || (dft_size.width == 1);
const bool is_row_dft = (flags & DFT_ROWS) != 0;
const bool is_scaled_dft = (flags & DFT_SCALE) != 0;
const bool is_inverse = (flags & DFT_INVERSE) != 0;
const bool is_complex_input = src.channels() == 2;
const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
// We don't support real-to-real transform
CV_Assert( is_complex_input || is_complex_output );
GpuMat src_cont = src;
// Make sure we work with a continuous input here,
// as CUFFT can't handle gaps
createContinuous(src.rows, src.cols, src.type(), src_cont);
if (src_cont.data != src.data)
src.copyTo(src_cont, stream);
Size dft_size_opt = dft_size;
if (is_1d_input && !is_row_dft)
{
// If the source matrix is a single column, handle it as a single row
dft_size_opt.width = std::max(dft_size.width, dft_size.height);
dft_size_opt.height = std::min(dft_size.width, dft_size.height);
}
CV_Assert( dft_size_opt.width > 1 );
cufftType dft_type = CUFFT_R2C;
if (is_complex_input)
dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
cufftHandle plan;
if (is_1d_input || is_row_dft)
cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
else
cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
if (is_complex_input)
{
if (is_complex_output)
{
createContinuous(dft_size, CV_32FC2, _dst);
GpuMat dst = _dst.getGpuMat();
cufftSafeCall(cufftExecC2C(
plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
}
else
{
createContinuous(dft_size, CV_32F, _dst);
GpuMat dst = _dst.getGpuMat();
cufftSafeCall(cufftExecC2R(
plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
}
}
else
{
// dft_size_opt may have been transposed above for a single-column input; the packed
// R2C output must match the orientation that was actually transformed
if (dft_size == dft_size_opt)
createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
else
createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
GpuMat dst = _dst.getGpuMat();
cufftSafeCall(cufftExecR2C(
plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
}
cufftSafeCall( cufftDestroy(plan) );
if (is_scaled_dft)
cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
#endif
}
//////////////////////////////////////////////////////////////////////////////
// Convolution
#ifdef HAVE_CUFFT
namespace
{
class ConvolutionImpl : public Convolution
{
public:
explicit ConvolutionImpl(Size user_block_size_) : user_block_size(user_block_size_) {}
void convolve(InputArray image, InputArray templ, OutputArray result, bool ccorr = false, Stream& stream = Stream::Null());
private:
void create(Size image_size, Size templ_size);
static Size estimateBlockSize(Size result_size);
Size result_size;
Size block_size;
Size user_block_size;
Size dft_size;
int spect_len;
GpuMat image_spect, templ_spect, result_spect;
GpuMat image_block, templ_block, result_data;
};
void ConvolutionImpl::create(Size image_size, Size templ_size)
{
result_size = Size(image_size.width - templ_size.width + 1,
image_size.height - templ_size.height + 1);
block_size = user_block_size;
if (user_block_size.width == 0 || user_block_size.height == 0)
block_size = estimateBlockSize(result_size);
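// Pad each block to block_size + templ_size - 1 (so neighbouring blocks don't wrap around)
// and round the padded size up to the next power of two for CUFFT.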
dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.)));
dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.)));
// CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192),
// see CUDA Toolkit 4.1 CUFFT Library Programming Guide
if (dft_size.width > 8192)
dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
if (dft_size.height > 8192)
dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
// To avoid wasting time doing small DFTs
dft_size.width = std::max(dft_size.width, 512);
dft_size.height = std::max(dft_size.height, 512);
createContinuous(dft_size, CV_32F, image_block);
createContinuous(dft_size, CV_32F, templ_block);
createContinuous(dft_size, CV_32F, result_data);
spect_len = dft_size.height * (dft_size.width / 2 + 1);
createContinuous(1, spect_len, CV_32FC2, image_spect);
createContinuous(1, spect_len, CV_32FC2, templ_spect);
createContinuous(1, spect_len, CV_32FC2, result_spect);
// Use the largest result block size that the chosen DFT size can accommodate
block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
}
Size ConvolutionImpl::estimateBlockSize(Size result_size)
{
int width = (result_size.width + 2) / 3;
int height = (result_size.height + 2) / 3;
width = std::min(width, result_size.width);
height = std::min(height, result_size.height);
return Size(width, height);
}
void ConvolutionImpl::convolve(InputArray _image, InputArray _templ, OutputArray _result, bool ccorr, Stream& _stream)
{
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.type() == CV_32FC1 );
CV_Assert( templ.type() == CV_32FC1 );
create(image.size(), templ.size());
_result.create(result_size, CV_32FC1);
GpuMat result = _result.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
cufftHandle planR2C, planC2R;
cufftSafeCall( cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R) );
cufftSafeCall( cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C) );
cufftSafeCall( cufftSetStream(planR2C, stream) );
cufftSafeCall( cufftSetStream(planC2R, stream) );
GpuMat templ_roi(templ.size(), CV_32FC1, templ.data, templ.step);
cuda::copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
templ_block.cols - templ_roi.cols, 0, Scalar(), _stream);
cufftSafeCall( cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(), templ_spect.ptr<cufftComplex>()) );
// Process all blocks of the result matrix
for (int y = 0; y < result.rows; y += block_size.height)
{
for (int x = 0; x < result.cols; x += block_size.width)
{
Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
std::min(y + dft_size.height, image.rows) - y);
GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
image.step);
cuda::copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
0, image_block.cols - image_roi.cols, 0, Scalar(), _stream);
cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
image_spect.ptr<cufftComplex>()));
cuda::mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
1.f / dft_size.area(), ccorr, _stream);
cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
result_data.ptr<cufftReal>()));
Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
std::min(y + block_size.height, result.rows) - y);
GpuMat result_roi(result_roi_size, result.type(),
(void*)(result.ptr<float>(y) + x), result.step);
GpuMat result_block(result_roi_size, result_data.type(),
result_data.ptr(), result_data.step);
result_block.copyTo(result_roi, _stream);
}
}
cufftSafeCall( cufftDestroy(planR2C) );
cufftSafeCall( cufftDestroy(planC2R) );
}
}
#endif
Ptr<Convolution> cv::cuda::createConvolution(Size user_block_size)
{
#ifndef HAVE_CUFFT
(void) user_block_size;
CV_Error(Error::StsNotImplemented, "The library was built without CUFFT");
return Ptr<Convolution>();
#else
return new ConvolutionImpl(user_block_size);
#endif
}
#endif /* !defined (HAVE_CUDA) */
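ConvolutionImpl above implements block-wise FFT convolution: the template spectrum is computed once, and each block of the result is produced by a spectrum multiply-and-scale followed by an inverse transform. A short host-side sketch of driving it (CV_32FC1 inputs as required by the asserts; the function name is illustrative):

#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>

void crossCorrelate(const cv::Mat& image, const cv::Mat& templ, cv::Mat& result)
{
    CV_Assert(image.type() == CV_32FC1 && templ.type() == CV_32FC1);

    cv::cuda::GpuMat d_image(image), d_templ(templ), d_result;

    // A default-constructed block size lets the implementation estimate one from the result size.
    cv::Ptr<cv::cuda::Convolution> conv = cv::cuda::createConvolution();

    // ccorr = true selects cross-correlation; false gives convolution.
    conv->convolve(d_image, d_templ, d_result, true);

    // The result is (image.cols - templ.cols + 1) x (image.rows - templ.rows + 1),
    // as computed in ConvolutionImpl::create().
    d_result.download(result);
}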

View File

@@ -0,0 +1,612 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::merge(const GpuMat*, size_t, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::merge(const std::vector<GpuMat>&, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::split(InputArray, GpuMat*, Stream&) { throw_no_cuda(); }
void cv::cuda::split(InputArray, std::vector<GpuMat>&, Stream&) { throw_no_cuda(); }
void cv::cuda::transpose(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::flip(InputArray, OutputArray, int, Stream&) { throw_no_cuda(); }
Ptr<LookUpTable> cv::cuda::createLookUpTable(InputArray) { throw_no_cuda(); return Ptr<LookUpTable>(); }
void cv::cuda::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int, Scalar, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// merge/split
namespace cv { namespace cuda { namespace device
{
namespace split_merge
{
void merge(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
}
}}}
namespace
{
void merge_caller(const GpuMat* src, size_t n, OutputArray _dst, Stream& stream)
{
CV_Assert( src != 0 );
CV_Assert( n > 0 && n <= 4 );
const int depth = src[0].depth();
const Size size = src[0].size();
for (size_t i = 0; i < n; ++i)
{
CV_Assert( src[i].size() == size );
CV_Assert( src[i].depth() == depth );
CV_Assert( src[i].channels() == 1 );
}
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
if (n == 1)
{
src[0].copyTo(_dst, stream);
}
else
{
_dst.create(size, CV_MAKE_TYPE(depth, (int)n));
GpuMat dst = _dst.getGpuMat();
PtrStepSzb src_as_devmem[4];
for(size_t i = 0; i < n; ++i)
src_as_devmem[i] = src[i];
PtrStepSzb dst_as_devmem(dst);
cv::cuda::device::split_merge::merge(src_as_devmem, dst_as_devmem, (int)n, CV_ELEM_SIZE(depth), StreamAccessor::getStream(stream));
}
}
void split_caller(const GpuMat& src, GpuMat* dst, Stream& stream)
{
CV_Assert( dst != 0 );
const int depth = src.depth();
const int num_channels = src.channels();
CV_Assert( num_channels <= 4 );
if (depth == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
if (num_channels == 1)
{
src.copyTo(dst[0], stream);
return;
}
for (int i = 0; i < num_channels; ++i)
dst[i].create(src.size(), depth);
PtrStepSzb dst_as_devmem[4];
for (int i = 0; i < num_channels; ++i)
dst_as_devmem[i] = dst[i];
PtrStepSzb src_as_devmem(src);
cv::cuda::device::split_merge::split(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), StreamAccessor::getStream(stream));
}
}
void cv::cuda::merge(const GpuMat* src, size_t n, OutputArray dst, Stream& stream)
{
merge_caller(src, n, dst, stream);
}
void cv::cuda::merge(const std::vector<GpuMat>& src, OutputArray dst, Stream& stream)
{
merge_caller(&src[0], src.size(), dst, stream);
}
void cv::cuda::split(InputArray _src, GpuMat* dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
split_caller(src, dst, stream);
}
void cv::cuda::split(InputArray _src, std::vector<GpuMat>& dst, Stream& stream)
{
GpuMat src = _src.getGpuMat();
dst.resize(src.channels());
if(src.channels() > 0)
split_caller(src, &dst[0], stream);
}
////////////////////////////////////////////////////////////////////////
// transpose
namespace arithm
{
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
}
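// transpose dispatches on element size: 1-byte types go through nppiTranspose_8u_C1R,
// 4- and 8-byte types use the templated kernel declared above (8-byte types
// additionally require NATIVE_DOUBLE support). Minimal sketch, assuming d_src
// holds a CV_32FC1 matrix already on the device:
//
//   cv::cuda::GpuMat d_src, d_dst;
//   cv::cuda::transpose(d_src, d_dst);   // d_dst is src.cols x src.rows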
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
_dst.create( src.cols, src.rows, src.type() );
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
if (src.elemSize() == 1)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else if (src.elemSize() == 4)
{
arithm::transpose<int>(src, dst, stream);
}
else // if (src.elemSize() == 8)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
arithm::transpose<double>(src, dst, stream);
}
}
////////////////////////////////////////////////////////////////////////
// flip
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
template <int DEPTH> struct NppMirrorFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
};
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
{
typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
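// flipCode follows the CPU cv::flip convention: 0 flips around the x-axis,
// a positive value around the y-axis, a negative value around both.
// Minimal sketch, assuming an uploaded CV_8UC3 image:
//
//   cv::cuda::GpuMat d_img, d_mirrored;
//   cv::cuda::flip(d_img, d_mirrored, 1);   // mirror around the y-axis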
void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};
GpuMat src = _src.getGpuMat();
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// LUT
#if (CUDA_VERSION >= 5000)
namespace
{
class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
int lut_cn;
int nValues3[3];
const Npp32s* pValues3[3];
const Npp32s* pLevels3[3];
GpuMat d_pLevels;
GpuMat d_nppLut;
GpuMat d_nppLut3[3];
};
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
GpuMat lut;
if (_lut.kind() == _InputArray::GPU_MAT)
{
lut = _lut.getGpuMat();
}
else
{
Mat hLut = _lut.getMat();
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
lut.upload(Mat(1, 256, hLut.type(), hLut.data));
}
lut_cn = lut.channels();
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.rows == 1 && lut.cols == 256 );
lut.convertTo(d_nppLut, CV_32S);
if (lut_cn == 1)
{
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
}
else
{
cuda::split(d_nppLut, d_nppLut3);
pValues3[0] = d_nppLut3[0].ptr<Npp32s>();
pValues3[1] = d_nppLut3[1].ptr<Npp32s>();
pValues3[2] = d_nppLut3[2].ptr<Npp32s>();
}
}
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
}
else
{
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#else // (CUDA_VERSION >= 5000)
namespace
{
class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
int lut_cn;
Npp32s pLevels[256];
int nValues3[3];
const Npp32s* pValues3[3];
const Npp32s* pLevels3[3];
Mat nppLut;
Mat nppLut3[3];
};
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
nValues3[0] = nValues3[1] = nValues3[2] = 256;
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
Mat lut;
if (_lut.kind() == _InputArray::GPU_MAT)
{
lut = Mat(_lut.getGpuMat());
}
else
{
Mat hLut = _lut.getMat();
CV_Assert( hLut.total() == 256 && hLut.isContinuous() );
lut = hLut;
}
lut_cn = lut.channels();
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.rows == 1 && lut.cols == 256 );
lut.convertTo(nppLut, CV_32S);
if (lut_cn == 1)
{
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
}
else
{
cv::split(nppLut, nppLut3);
pValues3[0] = nppLut3[0].ptr<Npp32s>();
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
}
}
void LookUpTableImpl::transform(InputArray _src, OutputArray _dst, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut_cn == 1 || lut_cn == cn );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
}
else
{
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // (CUDA_VERSION >= 5000)
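// Factory for the 256-entry lookup-table transform (8-bit sources only).
// Minimal sketch, assuming lutMat is a 1x256 CV_8UC1 (or CV_8UC3) host Mat:
//
//   cv::Ptr<cv::cuda::LookUpTable> tbl = cv::cuda::createLookUpTable(lutMat);
//   tbl->transform(d_src8u, d_dst);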
Ptr<LookUpTable> cv::cuda::createLookUpTable(InputArray lut)
{
return new LookUpTableImpl(lut);
}
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
}
}}}
namespace
{
template <typename T, int cn> void copyMakeBorder_caller(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
{
using namespace ::cv::cuda::device::imgproc;
Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));
copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
}
}
#if defined __GNUC__ && __GNUC__ > 2 && __GNUC_MINOR__ > 4
typedef Npp32s __attribute__((__may_alias__)) Npp32s_a;
#else
typedef Npp32s Npp32s_a;
#endif
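// copyMakeBorder uses NPP only for BORDER_CONSTANT with CV_8UC1, CV_8UC4,
// CV_32SC1 and CV_32FC1; every other supported combination falls through to
// the templated device kernel. Minimal sketch, assuming an uploaded CV_8UC1
// image padded by 10 pixels on each side:
//
//   cv::cuda::GpuMat d_src, d_padded;
//   cv::cuda::copyMakeBorder(d_src, d_padded, 10, 10, 10, 10, cv::BORDER_REPLICATE, cv::Scalar());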
void cv::cuda::copyMakeBorder(InputArray _src, OutputArray _dst, int top, int bottom, int left, int right, int borderType, Scalar value, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( borderType == BORDER_REFLECT_101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
_dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
if (borderType == BORDER_CONSTANT && (src.type() == CV_8UC1 || src.type() == CV_8UC4 || src.type() == CV_32SC1 || src.type() == CV_32FC1))
{
NppiSize srcsz;
srcsz.width = src.cols;
srcsz.height = src.rows;
NppiSize dstsz;
dstsz.width = dst.cols;
dstsz.height = dst.rows;
NppStreamHandler h(stream);
switch (src.type())
{
case CV_8UC1:
{
Npp8u nVal = saturate_cast<Npp8u>(value[0]);
nppSafeCall( nppiCopyConstBorder_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_8UC4:
{
Npp8u nVal[] = {saturate_cast<Npp8u>(value[0]), saturate_cast<Npp8u>(value[1]), saturate_cast<Npp8u>(value[2]), saturate_cast<Npp8u>(value[3])};
nppSafeCall( nppiCopyConstBorder_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32SC1:
{
Npp32s nVal = saturate_cast<Npp32s>(value[0]);
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
case CV_32FC1:
{
Npp32f val = saturate_cast<Npp32f>(value[0]);
Npp32s nVal = *(reinterpret_cast<Npp32s_a*>(&val));
nppSafeCall( nppiCopyConstBorder_32s_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), srcsz,
dst.ptr<Npp32s>(), static_cast<int>(dst.step), dstsz, top, left, nVal) );
break;
}
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
static const caller_t callers[6][4] =
{
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
{0/*copyMakeBorder_caller<schar, 1>*/, 0/*copyMakeBorder_caller<schar, 2>*/ , 0/*copyMakeBorder_caller<schar, 3>*/, 0/*copyMakeBorder_caller<schar, 4>*/},
{ copyMakeBorder_caller<ushort, 1> , 0/*copyMakeBorder_caller<ushort, 2>*/, copyMakeBorder_caller<ushort, 3> , copyMakeBorder_caller<ushort, 4>},
{ copyMakeBorder_caller<short, 1> , 0/*copyMakeBorder_caller<short, 2>*/ , copyMakeBorder_caller<short, 3> , copyMakeBorder_caller<short, 4>},
{0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/},
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
};
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, top, left, borderType, value, stream);
}
}
#endif /* !defined (HAVE_CUDA) */

View File

@@ -0,0 +1,147 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
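// Per-element |a - b| kernels. VAbsDiff4 and VAbsDiff2 use the vabsdiff4 /
// vabsdiff2 SIMD-in-word helpers to process four 8-bit (or two 16-bit) lanes
// packed in one 32-bit word; AbsDiffMat handles one element per thread with
// saturation. The absDiffMat* wrappers below feed these functors into the
// generic device::transform launcher.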
namespace arithm
{
struct VAbsDiff4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff4(a, b);
}
__host__ __device__ __forceinline__ VAbsDiff4() {}
__host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};
struct VAbsDiff2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vabsdiff2(a, b);
}
__host__ __device__ __forceinline__ VAbsDiff2() {}
__host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};
__device__ __forceinline__ int _abs(int a)
{
return ::abs(a);
}
__device__ __forceinline__ float _abs(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ double _abs(double a)
{
return ::fabs(a);
}
template <typename T> struct AbsDiffMat : binary_function<T, T, T>
{
__device__ __forceinline__ T operator ()(T a, T b) const
{
return saturate_cast<T>(_abs(a - b));
}
__host__ __device__ __forceinline__ AbsDiffMat() {}
__host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< arithm::AbsDiffMat<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
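// The *_v4 / *_v2 variants operate on whole 32-bit words; the host side is
// presumably expected to call them only when the data is suitably aligned and
// the row width is a multiple of the packing factor (this is an assumption:
// the dispatch logic lives in the host wrappers, not in this file).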
void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
}
void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
}
template <typename T>
void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream);
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,98 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
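// Per-element |a - val| against a scalar. The scalar is stored in the
// intermediate type S (float for the integer and float sources, double only
// for CV_64F inputs, as the explicit instantiations at the end of the file
// show) and the result is saturated back to T.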
namespace arithm
{
template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T>
{
S val;
__host__ explicit AbsDiffScalar(S val_) : val(val_) {}
__device__ __forceinline__ T operator ()(T a) const
{
abs_func<S> f;
return saturate_cast<T>(f(a - val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S> struct TransformFunctorTraits< arithm::AbsDiffScalar<T, S> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T, typename S>
void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
AbsDiffScalar<T, S> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream);
}
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
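// Per-element saturated addition. VAdd4 and VAdd2 use the vadd4 / vadd2
// SIMD-in-word helpers for packed 8-bit and 16-bit lanes; AddMat<T, D> adds
// one element per thread with saturate_cast to the destination type. The
// addMat wrapper picks the masked or unmasked device::transform path
// depending on whether a mask was supplied.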
namespace arithm
{
struct VAdd4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vadd4(a, b);
}
__host__ __device__ __forceinline__ VAdd4() {}
__host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
};
struct VAdd2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vadd2(a, b);
}
__host__ __device__ __forceinline__ VAdd2() {}
__host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
};
template <typename T, typename D> struct AddMat : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a + b);
}
__host__ __device__ __forceinline__ AddMat() {}
__host__ __device__ __forceinline__ AddMat(const AddMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::AddMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
}
void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
}
template <typename T, typename D>
void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
}
template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,148 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
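// Per-element addition of a scalar. The scalar is kept in the intermediate
// type S (float for most sources, double only for the CV_64F instantiations
// below), and the sum is saturate_cast to the destination type D. The wrapper
// again chooses the masked or unmasked transform path.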
namespace arithm
{
template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
{
S val;
__host__ explicit AddScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a + val);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::AddScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
AddScalar<T, S, D> op(static_cast<S>(val));
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,364 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
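// addWeighted computes dst = src1*alpha + src2*beta + gamma. The UseDouble
// trait below selects the coefficient precision: if any of the three types is
// double, the weights are carried as double; otherwise they are narrowed to
// float before the kernel runs.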
namespace arithm
{
template <typename T> struct UseDouble_
{
enum {value = 0};
};
template <> struct UseDouble_<double>
{
enum {value = 1};
};
template <typename T1, typename T2, typename D> struct UseDouble
{
enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)};
};
template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_;
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D>
{
float alpha;
float beta;
float gamma;
__host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D>
{
double alpha;
double beta;
double gamma;
__host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{
return saturate_cast<D>(a * alpha + b * beta + gamma);
}
};
template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>
{
AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
};
}
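// Traits selection for device::transform: when both source element sizes
// match, ArithmFuncTraits<src_size, dst_size> is used; otherwise the launch
// falls back to the default transform functor traits.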
namespace cv { namespace cuda { namespace device
{
template <typename T1, typename T2, typename D, size_t src1_size, size_t src2_size, size_t dst_size> struct AddWeightedTraits : DefaultTransformFunctorTraits< arithm::AddWeighted<T1, T2, D> >
{
};
template <typename T1, typename T2, typename D, size_t src_size, size_t dst_size> struct AddWeightedTraits<T1, T2, D, src_size, src_size, dst_size> : arithm::ArithmFuncTraits<src_size, dst_size>
{
};
template <typename T1, typename T2, typename D> struct TransformFunctorTraits< arithm::AddWeighted<T1, T2, D> > : AddWeightedTraits<T1, T2, D, sizeof(T1), sizeof(T2), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T1, typename T2, typename D>
void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream)
{
AddWeighted<T1, T2, D> op(alpha, beta, gamma);
device::transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */
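The instantiation list above covers source-depth pairs where the first depth is no deeper than the second, for every destination depth. Below is a minimal host-side dispatch sketch; the helper and typedef names, the operand swap, and the single-entry table are assumptions for illustration, not code from this commit.

#include <utility>
#include "opencv2/core/cuda/common.hpp"

using namespace cv::cuda;

namespace arithm
{
    // declared here, instantiated above for the supported type triples
    template <typename T1, typename T2, typename D>
    void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
}

typedef void (*weighted_func_t)(PtrStepSzb, double, PtrStepSzb, double, double, PtrStepSzb, cudaStream_t);

// Hypothetical dispatcher: route to the instantiation that matches the runtime depths.
static void addWeightedDispatch(PtrStepSzb src1, int depth1, double alpha,
                                PtrStepSzb src2, int depth2, double beta,
                                double gamma, PtrStepSzb dst, int ddepth, cudaStream_t stream)
{
    if (depth1 > depth2)
    {
        // only depth1 <= depth2 is instantiated, so swap the operands and their weights
        std::swap(src1, src2);
        std::swap(depth1, depth2);
        std::swap(alpha, beta);
    }

    // a full table would be indexed funcs[depth1][depth2][ddepth]; one entry shown
    static const weighted_func_t func_8u_32f_8u = arithm::addWeighted<uchar, float, uchar>;
    if (depth1 == CV_8U && depth2 == CV_32F && ddepth == CV_8U)
        func_8u_32f_8u(src1, alpha, src2, beta, gamma, dst, stream);
}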


@@ -0,0 +1,145 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __ARITHM_FUNC_TRAITS_HPP__
#define __ARITHM_FUNC_TRAITS_HPP__
#include <cstddef>
namespace arithm
{
// Launch configuration consumed by device::transform: block sizes for the plain
// one-element-per-thread kernel and for the vectorized "smart" kernel; smart_shift
// is the number of elements each thread handles in the smart path.
template <size_t src_size, size_t dst_size> struct ArithmFuncTraits
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 1 };
};
template <> struct ArithmFuncTraits<1, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<1, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<2, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 1>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 2>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
template <> struct ArithmFuncTraits<4, 4>
{
enum { simple_block_dim_x = 32 };
enum { simple_block_dim_y = 8 };
enum { smart_block_dim_x = 32 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
}
#endif // __ARITHM_FUNC_TRAITS_HPP__
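These traits are picked up through TransformFunctorTraits specializations, exactly as the kernels later in this commit do. The following is a minimal sketch of wiring a new, purely hypothetical unary functor into the same mechanism, assuming the includes used by those files; none of these names are part of this commit.

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "arithm_func_traits.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace arithm
{
    // hypothetical functor: absolute difference against a fixed value
    template <typename T> struct AbsDiffWith : unary_function<T, T>
    {
        T val;
        __host__ explicit AbsDiffWith(T val_) : val(val_) {}
        __device__ __forceinline__ T operator ()(T x) const
        {
            return x < val ? val - x : x - val;
        }
    };
}

namespace cv { namespace cuda { namespace device
{
    // reuse the shared launch configuration, keyed on the element sizes
    template <typename T> struct TransformFunctorTraits< arithm::AbsDiffWith<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
    {
    };
}}}

namespace arithm
{
    template <typename T> void absDiffWithScalar(PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream)
    {
        device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, AbsDiffWith<T>(val), WithOutMask(), stream);
    }
}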


@@ -0,0 +1,126 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< bit_not<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_and<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_or<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< bit_xor<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
}
template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
}
template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER
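A hedged usage sketch for the unmasked path of the functions above; the wrapper name is illustrative and not part of this commit. A default-constructed PtrStepb has a null data pointer, which selects the WithOutMask() branch.

#include "opencv2/core/cuda/common.hpp"

namespace arithm
{
    template <typename T>
    void bitMatAnd(cv::cuda::PtrStepSzb src1, cv::cuda::PtrStepSzb src2, cv::cuda::PtrStepSzb dst, cv::cuda::PtrStepb mask, cudaStream_t stream);
}

// AND two same-sized CV_8UC1 device buffers without a mask (illustrative helper).
static void bitwiseAnd8u(cv::cuda::PtrStepSzb a, cv::cuda::PtrStepSzb b, cv::cuda::PtrStepSzb dst, cudaStream_t stream)
{
    arithm::bitMatAnd<uchar>(a, b, dst, cv::cuda::PtrStepb(), stream);
}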


@@ -0,0 +1,104 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< binder2nd< bit_and<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_or<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< bit_xor<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream);
}
template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream);
}
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER
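The scalar always arrives as an unsigned int and is narrowed to the element type inside bind2nd, so for 8-bit data only the low byte participates. A small hedged sketch follows; the helper name is an assumption.

#include "opencv2/core/cuda/common.hpp"

namespace arithm
{
    template <typename T> void bitScalarAnd(cv::cuda::PtrStepSzb src1, unsigned int src2, cv::cuda::PtrStepSzb dst, cudaStream_t stream);
}

// Keep only the low nibble of every 8-bit pixel (illustrative helper).
static void maskLowNibble8u(cv::cuda::PtrStepSzb src, cv::cuda::PtrStepSzb dst, cudaStream_t stream)
{
    arithm::bitScalarAnd<uchar>(src, 0x0Fu, dst, stream);
}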


@@ -0,0 +1,206 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
struct VCmpEq4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpeq4(a, b);
}
__host__ __device__ __forceinline__ VCmpEq4() {}
__host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmpne4(a, b);
}
__host__ __device__ __forceinline__ VCmpNe4() {}
__host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmplt4(a, b);
}
__host__ __device__ __forceinline__ VCmpLt4() {}
__host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vcmple4(a, b);
}
__host__ __device__ __forceinline__ VCmpLe4() {}
__host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
};
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b); // bool result negated: 0x00 when false, 0xff when true (OpenCV mask convention)
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}
namespace arithm
{
void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
}
void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
}
void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
}
void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
}
template <template <typename> class Op, typename T>
void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
Cmp<Op<T>, T> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
}
template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<not_equal_to, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<less, T>(src1, src2, dst, stream);
}
template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
cmpMat<less_equal, T>(src1, src2, dst, stream);
}
template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER
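Only the ==, !=, < and <= kernels are instantiated here; the remaining comparisons can be obtained on the host by swapping the operands. A hedged sketch (the wrapper below is illustrative, not the module's actual dispatch code):

#include "opencv2/core/cuda/common.hpp"

namespace arithm
{
    template <typename T> void cmpMatLt(cv::cuda::PtrStepSzb src1, cv::cuda::PtrStepSzb src2, cv::cuda::PtrStepSzb dst, cudaStream_t stream);
}

// a > b  <=>  b < a, so no dedicated greater-than kernel is required
template <typename T>
static void cmpMatGt(cv::cuda::PtrStepSzb src1, cv::cuda::PtrStepSzb src2, cv::cuda::PtrStepSzb dst, cudaStream_t stream)
{
    arithm::cmpMatLt<T>(src2, src1, dst, stream);
}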


@@ -0,0 +1,284 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <class Op, typename T>
struct Cmp : binary_function<T, T, uchar>
{
__device__ __forceinline__ uchar operator()(T a, T b) const
{
Op op;
return -op(a, b);
}
};
#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
template <class Op, typename T, int cn> struct CmpScalar;
template <class Op, typename T>
struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
{
T val;
__host__ explicit CmpScalar(T val_) : val(val_) {}
__device__ __forceinline__ uchar operator()(T src) const
{
Cmp<Op, T> op;
return op(src, val);
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
{
TYPE_VEC(T, 2) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
{
TYPE_VEC(T, 3) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
}
};
template <class Op, typename T>
struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
{
TYPE_VEC(T, 4) val;
__host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
__device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
{
Cmp<Op, T> op;
return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
}
};
#undef TYPE_VEC
}
namespace cv { namespace cuda { namespace device
{
template <class Op, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Op, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
{
};
}}}
namespace arithm
{
template <template <typename> class Op, typename T, int cn>
void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type src_t;
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
src_t val1 = VecTraits<src_t>::make(sval);
CmpScalar<Op<T>, T, cn> op(val1);
device::transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream);
}
template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<equal_to, T, 1>,
cmpScalar<equal_to, T, 2>,
cmpScalar<equal_to, T, 3>,
cmpScalar<equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<not_equal_to, T, 1>,
cmpScalar<not_equal_to, T, 2>,
cmpScalar<not_equal_to, T, 3>,
cmpScalar<not_equal_to, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<less, T, 1>,
cmpScalar<less, T, 2>,
cmpScalar<less, T, 3>,
cmpScalar<less, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<less_equal, T, 1>,
cmpScalar<less_equal, T, 2>,
cmpScalar<less_equal, T, 3>,
cmpScalar<less_equal, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<greater, T, 1>,
cmpScalar<greater, T, 2>,
cmpScalar<greater, T, 3>,
cmpScalar<greater, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0,
cmpScalar<greater_equal, T, 1>,
cmpScalar<greater_equal, T, 2>,
cmpScalar<greater_equal, T, 3>,
cmpScalar<greater_equal, T, 4>
};
funcs[cn](src, val, dst, stream);
}
template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,131 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
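// Each thread writes one destination pixel; the source is wrapped in a
// BorderReader below, so indexing src(y - top, x - left) with out-of-range
// coordinates transparently applies the selected border policy.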
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
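// Dispatch table indexed by borderMode: constant, replicate, reflect,
// wrap and reflect101 border handling, in that order.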
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call
};
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,176 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace countNonZero
{
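// Global counter used by the __CUDA_ARCH__ < 200 path below: the last block to
// finish detects itself via atomicInc on this variable and performs the final
// reduction of the per-block partial counts.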
__device__ unsigned int blocks_finished = 0;
template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int mycount = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];
mycount += (srcVal != 0);
}
}
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
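// On compute capability >= 2.0 the block total is accumulated directly with a
// global atomicAdd; older architectures fall back to the two-pass scheme below,
// coordinated through blocks_finished.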
#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
count[bid] = mycount;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
if (tid == 0)
{
count[0] = mycount;
blocks_finished = 0;
}
}
#endif
}
const int threads_x = 32;
const int threads_y = 8;
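// The grid is clamped to at most 32 x 8 blocks, so the partial-count buffer
// stays small regardless of image size; each thread then loops over a
// twidth x theight tile (computed in run()).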
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
unsigned int* count_buf = buf.ptr(0);
cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count;
}
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,230 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
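// Packed division of an 8UC4 pixel (stored as a single uint) by a float:
// each byte lane is unpacked, multiplied by 1/b, saturated back to uchar and
// re-packed; division by zero yields 0 for all lanes.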
struct Div_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
if (b != 0)
{
b = 1.0f / b;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
}
return res;
}
};
struct Div_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
: make_short4(0,0,0,0);
}
};
template <typename T, typename D> struct Div : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(a / b) : 0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
{
__device__ __forceinline__ float operator ()(T a, T b) const
{
return b != 0 ? static_cast<float>(a) / b : 0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
{
__device__ __forceinline__ double operator ()(T a, T b) const
{
return b != 0 ? static_cast<double>(a) / b : 0;
}
__host__ __device__ __forceinline__ Div() {}
__host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
{
S scale;
__host__ explicit DivScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return b != 0 ? saturate_cast<D>(scale * a / b) : 0;
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<arithm::Div_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Div<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream);
}
void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream);
}
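// When scale == 1 the plain Div functor avoids the extra multiply; otherwise
// DivScale folds the scale factor into each quotient.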
template <typename T, typename S, typename D>
void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
{
if (scale == 1)
{
Div<T, D> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
DivScale<T, S, D> op(static_cast<S>(scale));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,168 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
template <typename T, typename S, typename D> struct DivScalar : unary_function<T, D>
{
S val;
__host__ explicit DivScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a / val);
}
};
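// Inverse variant: computes val / a (the scalar divided by each element),
// returning 0 where the source element is 0.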
template <typename T, typename S, typename D> struct DivScalarInv : unary_function<T, D>
{
S val;
explicit DivScalarInv(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return a != 0 ? saturate_cast<D>(val / a) : 0;
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalarInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream)
{
if (inv)
{
DivScalarInv<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
DivScalar<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,472 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
// Utility function to extract unsigned chars from an unsigned integer
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
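// Horizontal pass: one block per image row. Each thread loads 16 consecutive
// pixels (one uint4 of packed uchars), computes their inclusive prefix sum in
// registers, then uses warp shuffles plus a small shared-memory step to extend
// the scan across the whole row.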
__global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
// the prefix sum of each thread's 16 values is computed,
// now the final sums (result[15]) need to be shared
// with the other threads and added. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// The last thread in each warp, which holds the warp's sum,
// places it in shared memory.
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
// fold in unused warp
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here experiments with using
// the shuffle command to reformat the data
// inside the registers so that each thread holds
// consecutive data to be written so larger contiguous
// segments can be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after applying __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above actually represents
four integers to be written as uint4's to GMEM.
*/
result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);
result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);
result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;
///////
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
// continuing from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First, the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl scan reduces the SMEM, reformatting so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*p = sum;
}
#endif
}
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
// this is safe because the row step cannot actually be less than 512 bytes
int block = integral.cols / 16;
// launch 1 block / row
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
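// Buffered variant of the vertical pass: reads the row sums from a separate
// buffer and writes the final values starting at (1, 1), i.e. shifted by one
// row and one column in the output integral image.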
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = buffer.ptr(y) + tidx;
unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM
// shfl scan reduces the SMEM, reformatting so the column
// sums are computed in a warp
// then read out properly
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*dst = sum;
}
#endif
}
// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
const int block = blockStep;
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,302 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/type_traits.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
//////////////////////////////////////////////////////////////////////////
// absMat
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< abs_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
}
template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrMat
namespace arithm
{
template <typename T> struct Sqr : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return saturate_cast<T>(x * x);
}
__host__ __device__ __forceinline__ Sqr() {}
__host__ __device__ __forceinline__ Sqr(const Sqr&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::Sqr<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
}
template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// sqrtMat
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< sqrt_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
}
template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// logMat
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< log_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
}
template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// expMat
namespace arithm
{
template <typename T> struct Exp : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
exp_func<T> f;
return saturate_cast<T>(f(x));
}
__host__ __device__ __forceinline__ Exp() {}
__host__ __device__ __forceinline__ Exp(const Exp&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::Exp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <typename T>
void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
}
template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// pow
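// Four PowOp variants follow:
//  * unsigned integers: saturate_cast(__powf(x, power));
//  * signed integers:   the same, with the sign restored afterwards when the source
//    value is negative and the truncated exponent is odd;
//  * float / double:    the power of |x| is taken, mirroring the CPU cv::pow
//    convention of using absolute values for non-integer exponents.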
namespace arithm
{
template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
return saturate_cast<T>(__powf((float)e, power));
}
};
template<typename T> struct PowOp<T, true> : unary_function<T, T>
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ T operator()(T e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
if ((e < 0) && (1 & static_cast<int>(power)))
res *= -1;
return res;
}
};
template<> struct PowOp<float> : unary_function<float, float>
{
float power;
__host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}
__device__ __forceinline__ float operator()(float e) const
{
return __powf(::fabs(e), power);
}
};
template<> struct PowOp<double> : unary_function<double, double>
{
double power;
__host__ explicit PowOp(double power_) : power(power_) {}
__device__ __forceinline__ double operator()(double e) const
{
return ::pow(::fabs(e), power);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< arithm::PowOp<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template<typename T>
void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
}
template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
}
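// Illustrative host-side dispatch (a sketch only, not part of this file; the wrapper
// is assumed to pick the instantiation by source depth):
//
//   typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
//   static const func_t funcs[] =
//   {
//       arithm::pow<uchar>, arithm::pow<schar>, arithm::pow<ushort>, arithm::pow<short>,
//       arithm::pow<int>, arithm::pow<float>, arithm::pow<double>
//   };
//   funcs[src.depth()](src, power, dst, StreamAccessor::getStream(stream));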
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,247 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace minMax
{
__device__ unsigned int blocks_finished = 0;
    // To avoid shared memory bank conflicts we convert each value into a value of
    // an appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename R>
struct GlobalReduce
{
static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
{
Emulation::glob::atomicMin(minval, mymin);
Emulation::glob::atomicMax(maxval, mymax);
}
#else
__shared__ bool is_last;
if (tid == 0)
{
minval[bid] = mymin;
maxval[bid] = mymax;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
int idx = ::min(tid, gridDim.x * gridDim.y - 1);
mymin = minval[idx];
mymax = maxval[idx];
const minimum<R> minOp;
const maximum<R> maxOp;
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
if (tid == 0)
{
minval[0] = mymin;
maxval[0] = mymax;
blocks_finished = 0;
}
}
#endif
}
};
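    // Global reduction strategy: on compute capability >= 2.0 each block folds its
    // partial min/max into global memory with atomicMin/atomicMax; on older devices
    // every block writes its partials and the last block to finish (tracked by the
    // blocks_finished ticket) re-reduces them and resets the counter.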
template <int BLOCK_SIZE, typename T, typename R, class Mask>
__global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
{
__shared__ R sminval[BLOCK_SIZE];
__shared__ R smaxval[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R mymin = numeric_limits<R>::max();
R mymax = -numeric_limits<R>::max();
const minimum<R> minOp;
const maximum<R> maxOp;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const R srcVal = ptr[x];
mymin = minOp(mymin, srcVal);
mymax = maxOp(mymax, srcVal);
}
}
}
device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));
GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(double);
bufrows = 2;
}
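    // Buffer layout: the caller supplies a 2-row buffer; row 0 holds the per-block
    // minima and row 1 the per-block maxima. The column count is sized for the widest
    // work type (double), so one buffer serves every source depth.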
__global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
{
*minval_buf = numeric_limits<int>::max();
*maxval_buf = numeric_limits<int>::min();
}
__global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
{
*minval_buf = numeric_limits<float>::max();
*maxval_buf = -numeric_limits<float>::max();
}
__global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
{
*minval_buf = numeric_limits<double>::max();
*maxval_buf = -numeric_limits<double>::max();
}
template <typename R>
void setDefault(R* minval_buf, R* maxval_buf)
{
setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
{
typedef typename MinMaxTypeTraits<T>::best_type R;
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
R* minval_buf = (R*) buf.ptr(0);
R* maxval_buf = (R*) buf.ptr(1);
setDefault(minval_buf, maxval_buf);
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
R minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
}
template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
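// Host-side usage sketch (illustrative only; 'buf' and the wrapper shape are
// assumptions, not part of this file):
//
//   int bufcols, bufrows;
//   minMax::getBufSize(src.cols, src.rows, bufcols, bufrows);
//   cv::cuda::ensureSizeIsEnough(bufrows, bufcols, CV_8U, buf);
//   double minVal, maxVal;
//   minMax::run<float>(src, mask, &minVal, &maxVal, buf);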
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,228 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
//////////////////////////////////////////////////////////////////////////
// min
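// VMin4 / VMin2 wrap the SIMD-in-a-word intrinsics vmin4 / vmin2, taking the
// element-wise minimum of four packed uchars or two packed ushorts per 32-bit word;
// the minMat_v4 / minMat_v2 entry points are presumably selected by the host wrapper
// only when the matrices are continuous and suitably aligned for such packing.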
namespace arithm
{
struct VMin4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin4(a, b);
}
__host__ __device__ __forceinline__ VMin4() {}
__host__ __device__ __forceinline__ VMin4(const VMin4&) {}
};
struct VMin2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmin2(a, b);
}
__host__ __device__ __forceinline__ VMin2() {}
__host__ __device__ __forceinline__ VMin2(const VMin2&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< minimum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
}
void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
}
template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
}
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
}
template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
//////////////////////////////////////////////////////////////////////////
// max
namespace arithm
{
struct VMax4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax4(a, b);
}
__host__ __device__ __forceinline__ VMax4() {}
__host__ __device__ __forceinline__ VMax4(const VMax4&) {}
};
struct VMax2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vmax2(a, b);
}
__host__ __device__ __forceinline__ VMax2() {}
__host__ __device__ __forceinline__ VMax2(const VMax2&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T> struct TransformFunctorTraits< maximum<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
}
void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
}
template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
}
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::cuda::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
}
template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,236 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace minMaxLoc
{
    // To avoid shared memory bank conflicts we convert each value into a value of
    // an appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
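    // Two-pass scheme: kernel_pass_1 computes per-block minima/maxima together with
    // their flattened (y * cols + x) locations; kernel_pass_2 runs as a single block
    // that reduces those per-block partials, and the host converts the flattened
    // index back into (x, y).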
template <int BLOCK_SIZE, typename T, class Mask>
__global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
work_type mymin = numeric_limits<work_type>::max();
work_type mymax = -numeric_limits<work_type>::max();
unsigned int myminloc = 0;
unsigned int mymaxloc = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const work_type srcVal = ptr[x];
if (srcVal < mymin)
{
mymin = srcVal;
myminloc = y * src.cols + x;
}
if (srcVal > mymax)
{
mymax = srcVal;
mymaxloc = y * src.cols + x;
}
}
}
}
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
tid,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (tid == 0)
{
minval[bid] = (T) mymin;
maxval[bid] = (T) mymax;
minloc[bid] = myminloc;
maxloc[bid] = mymaxloc;
}
}
template <int BLOCK_SIZE, typename T>
__global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
unsigned int idx = ::min(threadIdx.x, count - 1);
work_type mymin = minval[idx];
work_type mymax = maxval[idx];
unsigned int myminloc = minloc[idx];
unsigned int mymaxloc = maxloc[idx];
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
threadIdx.x,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (threadIdx.x == 0)
{
minval[0] = (T) mymin;
maxval[0] = (T) mymax;
minloc[0] = myminloc;
maxloc[0] = mymaxloc;
}
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
// For values
b1cols = (int)(grid.x * grid.y * elem_size);
b1rows = 2;
// For locations
b2cols = grid.x * grid.y * sizeof(int);
b2rows = 2;
}
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
T* minval_buf = (T*) valbuf.ptr(0);
T* maxval_buf = (T*) valbuf.ptr(1);
unsigned int* minloc_buf = locbuf.ptr(0);
unsigned int* maxloc_buf = locbuf.ptr(1);
if (mask.data)
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
else
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
unsigned int minloc_, maxloc_;
cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,211 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
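// Multiplication variants in this file:
//  * Mul_8uc4_32f  - treats a uint as four packed uchars and scales each byte by a
//    float with saturation (8UC4 image times 32F weight map);
//  * Mul_16sc4_32f - the short4 analogue;
//  * Mul / MulScale - generic per-element product, without or with an extra scale
//    factor applied before saturating to the destination depth.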
namespace arithm
{
struct Mul_8uc4_32f : binary_function<uint, float, uint>
{
__device__ __forceinline__ uint operator ()(uint a, float b) const
{
uint res = 0;
res |= (saturate_cast<uchar>((0xffu & (a )) * b) );
res |= (saturate_cast<uchar>((0xffu & (a >> 8)) * b) << 8);
res |= (saturate_cast<uchar>((0xffu & (a >> 16)) * b) << 16);
res |= (saturate_cast<uchar>((0xffu & (a >> 24)) * b) << 24);
return res;
}
__host__ __device__ __forceinline__ Mul_8uc4_32f() {}
__host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
};
struct Mul_16sc4_32f : binary_function<short4, float, short4>
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
}
__host__ __device__ __forceinline__ Mul_16sc4_32f() {}
__host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
};
template <typename T, typename D> struct Mul : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a * b);
}
__host__ __device__ __forceinline__ Mul() {}
__host__ __device__ __forceinline__ Mul(const Mul&) {}
};
template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
{
S scale;
__host__ explicit MulScale(S scale_) : scale(scale_) {}
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(scale * a * b);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<arithm::Mul_8uc4_32f> : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::Mul<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScale<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);
}
void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);
}
template <typename T, typename S, typename D>
void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)
{
if (scale == 1)
{
Mul<T, D> op;
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
else
{
MulScale<T, S, D> op(static_cast<S>(scale));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
}
template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,144 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
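// mulScalar converts the scalar to the work type S (float, or double on the double
// precision paths), multiplies every element by it, and saturates to the destination
// depth D.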
namespace arithm
{
template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D>
{
S val;
__host__ explicit MulScalar(S val_) : val(val_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(a * val);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::MulScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
{
MulScalar<T, S, D> op(static_cast<S>(val));
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,171 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "cvconfig.h"
#ifdef HAVE_CUFFT
#include <cufft.h>
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
//////////////////////////////////////////////////////////////////////////
// mulSpectrums
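    // One thread per complex element; products are formed with the cuCmulf helper.
    // The *_CONJ variants conjugate the second operand first (correlation rather than
    // convolution), and the mulAndScaleSpectrums* versions additionally scale the
    // product, typically by 1/N after a forward FFT.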
__global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
}
}
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulSpectrums_CONJ
__global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
}
}
void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums
__global__ void mulAndScaleSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
}
}
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums_CONJ
__global__ void mulAndScaleSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < c.cols && y < c.rows)
{
cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
}
}
void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, PtrStepSz<cufftComplex> c, cudaStream_t stream)
{
dim3 threads(256);
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads, 0, stream>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}}} // namespace cv { namespace cuda { namespace device
#endif // HAVE_CUFFT
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,217 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace mathfunc
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
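// The cartToPolar kernel below is parameterised by two policy structs: one produces the
// magnitude output (Magnitude, MagnitudeSqr or Nothing) and one the angle output (Atan2 or
// Nothing). Each policy's calc() writes at most one element, so unused outputs cost nothing.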
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
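// For polarToCart the magnitude matrix is optional: NonEmptyMag reads the supplied values,
// while EmptyMag substitutes a unit magnitude so that only the angle matrix is required.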
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
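// Dispatch table indexed by [magnitude buffer empty][squared magnitude requested][angle buffer
// empty]; when the magnitude output is absent the Nothing policy is selected regardless of magSqr.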
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,340 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "unroll_detail.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace reduce
{
struct Sum
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
__host__ __device__ __forceinline__ Sum() {}
__host__ __device__ __forceinline__ Sum(const Sum&) {}
};
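// Avg divides the accumulated sum by the element count, so its result type is promoted to
// floating point: double stays double, every other element type becomes float.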
template <typename T> struct OutputType
{
typedef float type;
};
template <> struct OutputType<double>
{
typedef double type;
};
struct Avg
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(0);
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
return a + b;
}
template <typename T>
__device__ __forceinline__ typename TypeVec<typename OutputType<typename VecTraits<T>::elem_type>::type, VecTraits<T>::cn>::vec_type result(T r, float sz) const
{
return r / sz;
}
__host__ __device__ __forceinline__ Avg() {}
__host__ __device__ __forceinline__ Avg(const Avg&) {}
};
struct Min
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
minimum<T> minOp;
return minOp(a, b);
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
__host__ __device__ __forceinline__ Min() {}
__host__ __device__ __forceinline__ Min(const Min&) {}
};
struct Max
{
template <typename T>
__device__ __forceinline__ T startValue() const
{
return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
}
template <typename T>
__device__ __forceinline__ T operator ()(T a, T b) const
{
maximum<T> maxOp;
return maxOp(a, b);
}
template <typename T>
__device__ __forceinline__ T result(T r, int) const
{
return r;
}
__host__ __device__ __forceinline__ Max() {}
__host__ __device__ __forceinline__ Max(const Max&) {}
};
///////////////////////////////////////////////////////////
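// Reduction along the rows: one output value per column. Each 16x16 block covers 16 columns;
// within a column the 16 threads each accumulate every 16th row, the partial sums are staged
// (transposed) in shared memory, combined with device::reduce<16>, and the row-0 threads write
// the per-column results to dst.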
template <typename T, typename S, typename D, class Op>
__global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
{
__shared__ S smem[16 * 16];
const int x = blockIdx.x * 16 + threadIdx.x;
S myVal = op.template startValue<S>();
if (x < src.cols)
{
for (int y = threadIdx.y; y < src.rows; y += 16)
{
S srcVal = src(y, x);
myVal = op(myVal, srcVal);
}
}
smem[threadIdx.x * 16 + threadIdx.y] = myVal;
__syncthreads();
volatile S* srow = smem + threadIdx.y * 16;
myVal = srow[threadIdx.x];
device::reduce<16>(srow, myVal, threadIdx.x, op);
if (threadIdx.x == 0)
srow[0] = myVal;
__syncthreads();
if (threadIdx.y == 0 && x < src.cols)
dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
}
template <typename T, typename S, typename D, class Op>
void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(src.cols, block.x));
Op op;
rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
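// funcs[] is indexed by the reduction op code; the order here (sum, avg, max, min) must match
// the codes passed in from the host side.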
static const func_t funcs[] =
{
rowsCaller<T, S, D, Sum>,
rowsCaller<T, S, D, Avg>,
rowsCaller<T, S, D, Max>,
rowsCaller<T, S, D, Min>
};
funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
}
template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
///////////////////////////////////////////////////////////
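// Reduction along the columns: one output value per row. One block of BLOCK_SIZE threads is
// launched per source row; the threads stride across the columns accumulating a cn-channel
// work value, and a single block-wide reduction yields the row's result.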
template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
__global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<S, cn>::vec_type work_type;
typedef typename TypeVec<D, cn>::vec_type dst_type;
__shared__ S smem[BLOCK_SIZE * cn];
const int y = blockIdx.x;
const src_type* srcRow = src.ptr(y);
work_type myVal = op.template startValue<work_type>();
for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));
if (threadIdx.x == 0)
dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
}
template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
{
const int BLOCK_SIZE = 256;
const dim3 block(BLOCK_SIZE);
const dim3 grid(src.rows);
Op op;
colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
static const func_t funcs[5][4] =
{
{0,0,0,0},
{colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
{colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
{colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
{colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
};
funcs[cn][op](src, dst, stream);
}
template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,511 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace split_merge
{
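// TypeTraits maps an element size (1/2/4/8 bytes) onto the built-in CUDA vector types used for
// packed loads and stores. double3/double4 do not exist, so the 3- and 4-channel double cases
// are handled by the explicit kernel specializations further down.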
template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
typedef T type;
typedef T type2;
typedef T type3;
typedef T type4;
};
template <typename T>
struct TypeTraits<T, 1>
{
typedef char type;
typedef char2 type2;
typedef char3 type3;
typedef char4 type4;
};
template <typename T>
struct TypeTraits<T, 2>
{
typedef short type;
typedef short2 type2;
typedef short3 type3;
typedef short4 type4;
};
template <typename T>
struct TypeTraits<T, 4>
{
typedef int type;
typedef int2 type2;
typedef int3 type3;
typedef int4 type4;
};
template <typename T>
struct TypeTraits<T, 8>
{
typedef double type;
typedef double2 type2;
//typedef double3 type3;
//typedef double4 type4;
};
typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);
//------------------------------------------------------------
// Merge
template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type2 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_y[x] = dst_elem;
}
}
template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type3 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
double* dst_y = (double*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[3 * x] = src0_y[x];
dst_y[3 * x + 1] = src1_y[x];
dst_y[3 * x + 2] = src2_y[x];
}
}
template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
typedef typename TypeTraits<T>::type4 dst_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const T* src0_y = (const T*)(src0 + y * src0_step);
const T* src1_y = (const T*)(src1 + y * src1_step);
const T* src2_y = (const T*)(src2 + y * src2_step);
const T* src3_y = (const T*)(src3 + y * src3_step);
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_type dst_elem;
dst_elem.x = src0_y[x];
dst_elem.y = src1_y[x];
dst_elem.z = src2_y[x];
dst_elem.w = src3_y[x];
dst_y[x] = dst_elem;
}
}
template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
const uchar* src1, size_t src1_step,
const uchar* src2, size_t src2_step,
const uchar* src3, size_t src3_step,
int rows, int cols, uchar* dst, size_t dst_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src0_y = (const double*)(src0 + y * src0_step);
const double* src1_y = (const double*)(src1 + y * src1_step);
const double* src2_y = (const double*)(src2 + y * src2_step);
const double* src3_y = (const double*)(src3 + y * src3_step);
double2* dst_y = (double2*)(dst + y * dst_step);
if (x < cols && y < rows)
{
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
}
}
template <typename T>
static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC2_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC3_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
mergeC4_<T><<<grid, block, 0, stream>>>(
src[0].data, src[0].step,
src[1].data, src[1].step,
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
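// Flat dispatch table: rows correspond to channel counts 2..4 and columns to element sizes
// 1, 2, 4, (unused) and 8 bytes, giving the index (total_channels - 2) * 5 + (elem_size >> 1).
// Null entries mark unsupported combinations.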
void merge(const PtrStepSzb* src, PtrStepSzb& dst,
int total_channels, size_t elem_size,
const cudaStream_t& stream)
{
static MergeFunction merge_func_tbl[] =
{
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
};
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
MergeFunction merge_func = merge_func_tbl[merge_func_id];
if (merge_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
merge_func(src, dst, stream);
}
//------------------------------------------------------------
// Split
template <typename T>
__global__ void splitC2_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step)
{
typedef typename TypeTraits<T>::type2 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
}
}
template <typename T>
__global__ void splitC3_(const uchar* src, size_t src_step,
int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
typedef typename TypeTraits<T>::type3 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
}
}
template <>
__global__ void splitC3_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double* src_y = (const double*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
if (x < cols && y < rows)
{
dst0_y[x] = src_y[3 * x];
dst1_y[x] = src_y[3 * x + 1];
dst2_y[x] = src_y[3 * x + 2];
}
}
template <typename T>
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
typedef typename TypeTraits<T>::type4 src_type;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const src_type* src_y = (const src_type*)(src + y * src_step);
T* dst0_y = (T*)(dst0 + y * dst0_step);
T* dst1_y = (T*)(dst1 + y * dst1_step);
T* dst2_y = (T*)(dst2 + y * dst2_step);
T* dst3_y = (T*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
src_type src_elem = src_y[x];
dst0_y[x] = src_elem.x;
dst1_y[x] = src_elem.y;
dst2_y[x] = src_elem.z;
dst3_y[x] = src_elem.w;
}
}
template <>
__global__ void splitC4_<double>(
const uchar* src, size_t src_step, int rows, int cols,
uchar* dst0, size_t dst0_step,
uchar* dst1, size_t dst1_step,
uchar* dst2, size_t dst2_step,
uchar* dst3, size_t dst3_step)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const double2* src_y = (const double2*)(src + y * src_step);
double* dst0_y = (double*)(dst0 + y * dst0_step);
double* dst1_y = (double*)(dst1 + y * dst1_step);
double* dst2_y = (double*)(dst2 + y * dst2_step);
double* dst3_y = (double*)(dst3 + y * dst3_step);
if (x < cols && y < rows)
{
double2 src_elem1 = src_y[2 * x];
double2 src_elem2 = src_y[2 * x + 1];
dst0_y[x] = src_elem1.x;
dst1_y[x] = src_elem1.y;
dst2_y[x] = src_elem2.x;
dst3_y[x] = src_elem2.y;
}
}
template <typename T>
static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC2_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC3_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template <typename T>
static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
splitC4_<T><<<grid, block, 0, stream>>>(
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
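// Same flat table layout and indexing as merge() above: (num_channels - 2) * 5 + (elem_size1 >> 1)
// selects the split kernel for the given channel count and element size.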
void split(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
{
static SplitFunction split_func_tbl[] =
{
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
};
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
SplitFunction split_func = split_func_tbl[split_func_id];
if (split_func == 0)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
split_func(src, dst, stream);
}
} // namespace split_merge
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
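// VSub4 and VSub2 subtract four packed 8-bit / two packed 16-bit values per 32-bit word using
// the vsub4/vsub2 SIMD-within-a-word helpers; SubMat is the generic per-element path with a
// saturating cast to the destination type.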
struct VSub4 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub4(a, b);
}
__host__ __device__ __forceinline__ VSub4() {}
__host__ __device__ __forceinline__ VSub4(const VSub4&) {}
};
struct VSub2 : binary_function<uint, uint, uint>
{
__device__ __forceinline__ uint operator ()(uint a, uint b) const
{
return vsub2(a, b);
}
__host__ __device__ __forceinline__ VSub2() {}
__host__ __device__ __forceinline__ VSub2(const VSub2&) {}
};
template <typename T, typename D> struct SubMat : binary_function<T, T, D>
{
__device__ __forceinline__ D operator ()(T a, T b) const
{
return saturate_cast<D>(a - b);
}
__host__ __device__ __forceinline__ SubMat() {}
__host__ __device__ __forceinline__ SubMat(const SubMat&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
{
};
template <typename T, typename D> struct TransformFunctorTraits< arithm::SubMat<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
}
void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
{
device::transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
}
template <typename T, typename D>
void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
}
template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,149 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
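// SubScalar evaluates saturate_cast<D>(scale * (a - val)); the caller passes scale = -1 when the
// operand order is reversed (inv == true), which turns the expression into val - a.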
template <typename T, typename S, typename D> struct SubScalar : unary_function<T, D>
{
S val;
int scale;
__host__ SubScalar(S val_, int scale_) : val(val_), scale(scale_) {}
__device__ __forceinline__ D operator ()(T a) const
{
return saturate_cast<D>(scale * (a - val));
}
};
}
namespace cv { namespace cuda { namespace device
{
template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::SubScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
{
};
}}}
namespace arithm
{
template <typename T, typename S, typename D>
void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
{
SubScalar<T, S, D> op(static_cast<S>(val), inv ? -1 : 1);
if (mask.data)
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
else
device::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
}
template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,381 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "unroll_detail.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace sum
{
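// Ticket counter used by the fall-back final-reduction path on devices below sm_20: the last
// block to finish detects itself via an atomicAdd on this variable and folds all per-block
// partial sums into result[0].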
__device__ unsigned int blocks_finished = 0;
template <typename R, int cn> struct AtomicAdd;
template <typename R> struct AtomicAdd<R, 1>
{
static __device__ void run(R* ptr, R val)
{
Emulation::glob::atomicAdd(ptr, val);
}
};
template <typename R> struct AtomicAdd<R, 2>
{
typedef typename TypeVec<R, 2>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
}
};
template <typename R> struct AtomicAdd<R, 3>
{
typedef typename TypeVec<R, 3>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
}
};
template <typename R> struct AtomicAdd<R, 4>
{
typedef typename TypeVec<R, 4>::vec_type val_type;
static __device__ void run(R* ptr, val_type val)
{
Emulation::glob::atomicAdd(ptr, val.x);
Emulation::glob::atomicAdd(ptr + 1, val.y);
Emulation::glob::atomicAdd(ptr + 2, val.z);
Emulation::glob::atomicAdd(ptr + 3, val.w);
}
};
template <int BLOCK_SIZE, typename R, int cn>
struct GlobalReduce
{
typedef typename TypeVec<R, cn>::vec_type result_type;
static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
{
#if __CUDA_ARCH__ >= 200
if (tid == 0)
AtomicAdd<R, cn>::run((R*) result, sum);
#else
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = sum;
__threadfence();
unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
if (tid == 0)
{
result[0] = sum;
blocks_finished = 0;
}
}
#endif
}
};
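    // Cross-block reduction strategy used above: on devices with compute capability
    // >= 2.0 the leader thread of every block simply atomicAdd's its partial sum into
    // the output, component by component (AtomicAdd<R, cn>). On older devices each
    // block writes its partial sum to result[bid], takes a ticket from blocks_finished,
    // and the block that draws the last ticket re-reduces all partials in shared memory,
    // stores the final value in result[0] and resets the counter for the next launch.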
template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
__global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
{
typedef typename VecTraits<src_type>::elem_type T;
typedef typename VecTraits<result_type>::elem_type R;
const int cn = VecTraits<src_type>::cn;
__shared__ R smem[BLOCK_SIZE * cn];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
result_type sum = VecTraits<result_type>::all(0);
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const src_type* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const src_type srcVal = ptr[x];
sum = sum + op(saturate_cast<result_type>(srcVal));
}
}
}
device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));
GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
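    // The grid is clamped to threads_x x threads_y (32 x 8) blocks, so at most 256
    // partial sums are produced whatever the image size; each thread then walks a
    // twidth x theight tile of pixels (see caller below). For example, a 1920x1080
    // single-channel image gets block = 32x8 and grid = 8x5, which yields
    // twidth = divUp(divUp(1920, 8), 32) = 8 and theight = divUp(divUp(1080, 5), 8) = 27.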
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
template <typename T, typename R, int cn, template <typename> class Op>
void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
{
typedef typename TypeVec<T, cn>::vec_type src_type;
typedef typename TypeVec<R, cn>::vec_type result_type;
PtrStepSz<src_type> src(src_);
result_type* buf = (result_type*) buf_;
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
Op<result_type> op;
if (mask.data)
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
else
kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
R result[4] = {0, 0, 0, 0};
cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );
out[0] = result[0];
out[1] = result[1];
out[2] = result[2];
out[3] = result[3];
}
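    // caller() relies on buf having been zeroed by the host wrapper (cv::cuda::sum and
    // friends call buf.setTo(0) first), because the compute >= 2.0 path accumulates into
    // it with atomics. After synchronization only the first result_type in buf is
    // meaningful; it is copied back and widened to the four doubles expected by the
    // Scalar-returning wrappers, with unused channels left at zero.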
template <typename T> struct SumType;
template <> struct SumType<uchar> { typedef unsigned int R; };
template <> struct SumType<schar> { typedef int R; };
template <> struct SumType<ushort> { typedef unsigned int R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, identity>(src, buf, out, mask);
}
template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
typedef typename SumType<T>::R R;
caller<T, R, cn, abs_func>(src, buf, out, mask);
}
template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template <typename T> struct Sqr : unary_function<T, T>
{
__device__ __forceinline__ T operator ()(T x) const
{
return x * x;
}
};
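    // Sqr squares every (saturate_cast'ed) element before accumulation, and runSqr below
    // forces the accumulator type to double, so the sum of squares behind
    // cv::cuda::sqrSum cannot overflow for integer inputs.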
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
{
caller<T, double, cn, Sqr>(src, buf, out, mask);
}
template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,114 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/simd_functions.hpp"
#include "arithm_func_traits.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace cv { namespace cuda { namespace device
{
template <typename T> struct TransformFunctorTraits< thresh_binary_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_binary_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_trunc_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
{
};
}}}
namespace arithm
{
template <template <typename> class Op, typename T>
void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream)
{
Op<T> op(thresh, maxVal);
device::transform(src, dst, op, WithOutMask(), stream);
}
template <typename T>
void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream);
static const caller_t callers[] =
{
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_to_zero_inv_func, T>
};
callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream);
}
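    // The callers[] table is indexed directly by the threshold type, relying on the
    // OpenCV enum values THRESH_BINARY = 0, THRESH_BINARY_INV = 1, THRESH_TRUNC = 2,
    // THRESH_TOZERO = 3 and THRESH_TOZERO_INV = 4 matching the order above.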
template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,122 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace arithm
{
const int TRANSPOSE_TILE_DIM = 16;
const int TRANSPOSE_BLOCK_ROWS = 16;
template <typename T>
__global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
{
__shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1];
int blockIdx_x, blockIdx_y;
// do diagonal reordering
if (gridDim.x == gridDim.y)
{
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
}
else
{
int bid = blockIdx.x + gridDim.x * blockIdx.y;
blockIdx_y = bid % gridDim.y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.cols)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.rows)
{
tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
}
}
}
__syncthreads();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.rows)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.cols)
{
dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
}
}
}
}
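    // The kernel stages a 16x16 tile in shared memory, padded to 17 columns so the
    // column-wise reads during write-out do not cause shared-memory bank conflicts,
    // and remaps blockIdx diagonally so that blocks touching the same column of the
    // source are spread across different global-memory partitions (the partition-camping
    // fix used in NVIDIA's matrix-transpose sample).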
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
transposeKernel<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,135 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __UNROLL_DETAIL_HPP__
#define __UNROLL_DETAIL_HPP__
#include <thrust/tuple.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
namespace detail
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
{
return smem;
}
template <typename R>
static __device__ __forceinline__ R& tie(R& val)
{
return val;
}
template <class Op>
static __device__ __forceinline__ const Op& op(const Op& op)
{
return op;
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE, typename R>
static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
{
return cv::cuda::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
template <typename R>
static __device__ __forceinline__ thrust::tuple<typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&, typename cv::cuda::device::VecTraits<R>::elem_type&> tie(R& val)
{
return thrust::tie(val.x, val.y, val.z, val.w);
}
template <class Op>
static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
{
return thrust::make_tuple(op, op, op, op);
}
};
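    // Unroll<cn> adapts the single-channel block reduction (device::reduce from
    // opencv2/core/cuda/reduce.hpp) to cn-channel sums: smem_tuple() hands it cn disjoint
    // BLOCK_SIZE-sized slices of shared memory, tie() exposes the vector's components as
    // a tuple of references, and op() replicates the scalar operator per component. For
    // cn = 2 this effectively runs two interleaved reductions over (val.x, val.y).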
}
#endif // __UNROLL_DETAIL_HPP__

File diff suppressed because it is too large

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -0,0 +1,70 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <limits>
#include "cvconfig.h"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
# include "opencv2/cudalegacy.hpp"
# include "opencv2/cudalegacy/private.hpp"
#endif
#ifdef HAVE_CUBLAS
# include <cublas.h>
#endif
#ifdef HAVE_CUFFT
# include <cufft.h>
#endif
#endif /* __OPENCV_PRECOMP_H__ */

View File

@@ -0,0 +1,863 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
double cv::cuda::norm(InputArray, int, InputArray, GpuMat&) { throw_no_cuda(); return 0.0; }
double cv::cuda::norm(InputArray, InputArray, GpuMat&, int) { throw_no_cuda(); return 0.0; }
Scalar cv::cuda::sum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::cuda::absSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
Scalar cv::cuda::sqrSum(InputArray, InputArray, GpuMat&) { throw_no_cuda(); return Scalar(); }
void cv::cuda::minMax(InputArray, double*, double*, InputArray, GpuMat&) { throw_no_cuda(); }
void cv::cuda::minMaxLoc(InputArray, double*, double*, Point*, Point*, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
int cv::cuda::countNonZero(InputArray, GpuMat&) { throw_no_cuda(); return 0; }
void cv::cuda::reduce(InputArray, OutputArray, int, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::meanStdDev(InputArray, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::rectStdDev(InputArray, InputArray, OutputArray, Rect, Stream&) { throw_no_cuda(); }
void cv::cuda::normalize(InputArray, OutputArray, double, double, int, int, InputArray, GpuMat&, GpuMat&) { throw_no_cuda(); }
void cv::cuda::integral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::cuda::sqrIntegral(InputArray, OutputArray, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace
{
class DeviceBuffer
{
public:
explicit DeviceBuffer(int count_ = 1) : count(count_)
{
cudaSafeCall( cudaMalloc(&pdev, count * sizeof(double)) );
}
~DeviceBuffer()
{
cudaSafeCall( cudaFree(pdev) );
}
operator double*() {return pdev;}
void download(double* hptr)
{
double hbuf;
cudaSafeCall( cudaMemcpy(&hbuf, pdev, sizeof(double), cudaMemcpyDeviceToHost) );
*hptr = hbuf;
}
void download(double** hptrs)
{
AutoBuffer<double, 2 * sizeof(double)> hbuf(count);
cudaSafeCall( cudaMemcpy((void*)hbuf, pdev, count * sizeof(double), cudaMemcpyDeviceToHost) );
for (int i = 0; i < count; ++i)
*hptrs[i] = hbuf[i];
}
private:
double* pdev;
int count;
};
}
////////////////////////////////////////////////////////////////////////
// norm
double cv::cuda::norm(InputArray _src, int normType, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1) );
GpuMat src_single_channel = src.reshape(1);
if (normType == NORM_L1)
return cuda::absSum(src_single_channel, mask, buf)[0];
if (normType == NORM_L2)
return std::sqrt(cuda::sqrSum(src_single_channel, mask, buf)[0]);
// NORM_INF
double min_val, max_val;
cuda::minMax(src_single_channel, &min_val, &max_val, mask, buf);
return std::max(std::abs(min_val), std::abs(max_val));
}
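// Usage sketch (illustrative only, not part of this translation unit): computing an
// L2 norm while reusing the caller-provided scratch buffer across calls.
#if 0
    cv::cuda::GpuMat d_img;   // assume a CV_32FC1 image already uploaded
    cv::cuda::GpuMat buf;     // grown on demand by ensureSizeIsEnough inside
    double l2 = cv::cuda::norm(d_img, cv::NORM_L2, cv::noArray(), buf);
#endif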
double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normType)
{
#if CUDA_VERSION < 5050
(void) buf;
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2, NppiSize oSizeROI, Npp64f* pRetVal);
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
#else
typedef NppStatus (*func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
NppiSize oSizeROI, Npp64f* pRetVal, Npp8u * pDeviceBuffer);
typedef NppStatus (*buf_size_func_t)(NppiSize oSizeROI, int* hpBufferSize);
static const func_t funcs[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
static const buf_size_func_t buf_size_funcs[] = {nppiNormDiffInfGetBufferHostSize_8u_C1R, nppiNormDiffL1GetBufferHostSize_8u_C1R, nppiNormDiffL2GetBufferHostSize_8u_C1R};
#endif
GpuMat src1 = _src1.getGpuMat();
GpuMat src2 = _src2.getGpuMat();
CV_Assert( src1.type() == CV_8UC1 );
CV_Assert( src1.size() == src2.size() && src1.type() == src2.type() );
CV_Assert( normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2 );
NppiSize sz;
sz.width = src1.cols;
sz.height = src1.rows;
const int funcIdx = normType >> 1;
DeviceBuffer dbuf;
#if CUDA_VERSION < 5050
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
#else
int bufSize;
buf_size_funcs[funcIdx](sz, &bufSize);
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
nppSafeCall( funcs[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf, buf.data) );
#endif
cudaSafeCall( cudaDeviceSynchronize() );
double retVal;
dbuf.download(&retVal);
return retVal;
}
////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
template <typename T, int cn>
void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
template <typename T, int cn>
void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
}
Scalar cv::cuda::sum(InputArray _src, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
{0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
{0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
{0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
{0, ::sum::run<int , 1>, ::sum::run<int , 2>, ::sum::run<int , 3>, ::sum::run<int , 4>},
{0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
{0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
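// Usage sketch (illustrative only): summing a 3-channel 8-bit image; the returned
// Scalar carries one sum per channel, unused channels stay zero.
#if 0
    cv::cuda::GpuMat d_img;   // assume CV_8UC3, already uploaded
    cv::cuda::GpuMat buf;
    cv::Scalar s = cv::cuda::sum(d_img, cv::noArray(), buf);   // s[0..2] are per-channel sums
#endif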
Scalar cv::cuda::absSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
{0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
{0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
{0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
{0, ::sum::runAbs<int , 1>, ::sum::runAbs<int , 2>, ::sum::runAbs<int , 3>, ::sum::runAbs<int , 4>},
{0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
{0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
Scalar cv::cuda::sqrSum(InputArray _src, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
static const func_t funcs[7][5] =
{
{0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
{0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
{0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
{0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
{0, ::sum::runSqr<int , 1>, ::sum::runSqr<int , 2>, ::sum::runSqr<int , 3>, ::sum::runSqr<int , 4>},
{0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
{0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
};
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
double result[4];
func(src, buf.data, result, mask);
return Scalar(result[0], result[1], result[2], result[3]);
}
////////////////////////////////////////////////////////////////////////
// minMax
namespace minMax
{
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
void cv::cuda::minMax(InputArray _src, double* minVal, double* maxVal, InputArray _mask, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
static const func_t funcs[] =
{
::minMax::run<uchar>,
::minMax::run<schar>,
::minMax::run<ushort>,
::minMax::run<short>,
::minMax::run<int>,
::minMax::run<float>,
::minMax::run<double>
};
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
double temp1, temp2;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
}
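// minVal / maxVal may each be NULL: a local temporary is substituted so the device
// reduction always has a destination. Only single-channel sources are accepted, so
// multi-channel matrices have to be reshaped to one channel by the caller first.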
////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minMaxLoc
{
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
template <typename T>
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
{
GpuMat src = _src.getGpuMat();
GpuMat mask = _mask.getGpuMat();
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
static const func_t funcs[] =
{
::minMaxLoc::run<uchar>,
::minMaxLoc::run<schar>,
::minMaxLoc::run<ushort>,
::minMaxLoc::run<short>,
::minMaxLoc::run<int>,
::minMaxLoc::run<float>,
::minMaxLoc::run<double>
};
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size valbuf_size, locbuf_size;
::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
const func_t func = funcs[src.depth()];
double temp1, temp2;
Point temp3, temp4;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
}
//////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countNonZero
{
void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
}
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
static const func_t funcs[] =
{
::countNonZero::run<uchar>,
::countNonZero::run<schar>,
::countNonZero::run<ushort>,
::countNonZero::run<short>,
::countNonZero::run<int>,
::countNonZero::run<float>,
::countNonZero::run<double>
};
CV_Assert(src.channels() == 1);
if (src.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
Size buf_size;
::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
return func(src, buf);
}
//////////////////////////////////////////////////////////////////////////////
// reduce
namespace reduce
{
template <typename T, typename S, typename D>
void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template <typename T, typename S, typename D>
void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
}
void cv::cuda::reduce(InputArray _src, OutputArray _dst, int dim, int reduceOp, int dtype, Stream& stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.channels() <= 4 );
CV_Assert( dim == 0 || dim == 1 );
CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
if (dtype < 0)
dtype = src.depth();
_dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
GpuMat dst = _dst.getGpuMat();
if (dim == 0)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::rows<unsigned char, int, unsigned char>,
0/*::reduce::rows<unsigned char, int, signed char>*/,
0/*::reduce::rows<unsigned char, int, unsigned short>*/,
0/*::reduce::rows<unsigned char, int, short>*/,
::reduce::rows<unsigned char, int, int>,
::reduce::rows<unsigned char, float, float>,
::reduce::rows<unsigned char, double, double>
},
{
0/*::reduce::rows<signed char, int, unsigned char>*/,
0/*::reduce::rows<signed char, int, signed char>*/,
0/*::reduce::rows<signed char, int, unsigned short>*/,
0/*::reduce::rows<signed char, int, short>*/,
0/*::reduce::rows<signed char, int, int>*/,
0/*::reduce::rows<signed char, float, float>*/,
0/*::reduce::rows<signed char, double, double>*/
},
{
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
0/*::reduce::rows<unsigned short, int, signed char>*/,
::reduce::rows<unsigned short, int, unsigned short>,
0/*::reduce::rows<unsigned short, int, short>*/,
::reduce::rows<unsigned short, int, int>,
::reduce::rows<unsigned short, float, float>,
::reduce::rows<unsigned short, double, double>
},
{
0/*::reduce::rows<short, int, unsigned char>*/,
0/*::reduce::rows<short, int, signed char>*/,
0/*::reduce::rows<short, int, unsigned short>*/,
::reduce::rows<short, int, short>,
::reduce::rows<short, int, int>,
::reduce::rows<short, float, float>,
::reduce::rows<short, double, double>
},
{
0/*::reduce::rows<int, int, unsigned char>*/,
0/*::reduce::rows<int, int, signed char>*/,
0/*::reduce::rows<int, int, unsigned short>*/,
0/*::reduce::rows<int, int, short>*/,
::reduce::rows<int, int, int>,
::reduce::rows<int, float, float>,
::reduce::rows<int, double, double>
},
{
0/*::reduce::rows<float, float, unsigned char>*/,
0/*::reduce::rows<float, float, signed char>*/,
0/*::reduce::rows<float, float, unsigned short>*/,
0/*::reduce::rows<float, float, short>*/,
0/*::reduce::rows<float, float, int>*/,
::reduce::rows<float, float, float>,
::reduce::rows<float, double, double>
},
{
0/*::reduce::rows<double, double, unsigned char>*/,
0/*::reduce::rows<double, double, signed char>*/,
0/*::reduce::rows<double, double, unsigned short>*/,
0/*::reduce::rows<double, double, short>*/,
0/*::reduce::rows<double, double, int>*/,
0/*::reduce::rows<double, double, float>*/,
::reduce::rows<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
}
else
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
static const func_t funcs[7][7] =
{
{
::reduce::cols<unsigned char, int, unsigned char>,
0/*::reduce::cols<unsigned char, int, signed char>*/,
0/*::reduce::cols<unsigned char, int, unsigned short>*/,
0/*::reduce::cols<unsigned char, int, short>*/,
::reduce::cols<unsigned char, int, int>,
::reduce::cols<unsigned char, float, float>,
::reduce::cols<unsigned char, double, double>
},
{
0/*::reduce::cols<signed char, int, unsigned char>*/,
0/*::reduce::cols<signed char, int, signed char>*/,
0/*::reduce::cols<signed char, int, unsigned short>*/,
0/*::reduce::cols<signed char, int, short>*/,
0/*::reduce::cols<signed char, int, int>*/,
0/*::reduce::cols<signed char, float, float>*/,
0/*::reduce::cols<signed char, double, double>*/
},
{
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
0/*::reduce::cols<unsigned short, int, signed char>*/,
::reduce::cols<unsigned short, int, unsigned short>,
0/*::reduce::cols<unsigned short, int, short>*/,
::reduce::cols<unsigned short, int, int>,
::reduce::cols<unsigned short, float, float>,
::reduce::cols<unsigned short, double, double>
},
{
0/*::reduce::cols<short, int, unsigned char>*/,
0/*::reduce::cols<short, int, signed char>*/,
0/*::reduce::cols<short, int, unsigned short>*/,
::reduce::cols<short, int, short>,
::reduce::cols<short, int, int>,
::reduce::cols<short, float, float>,
::reduce::cols<short, double, double>
},
{
0/*::reduce::cols<int, int, unsigned char>*/,
0/*::reduce::cols<int, int, signed char>*/,
0/*::reduce::cols<int, int, unsigned short>*/,
0/*::reduce::cols<int, int, short>*/,
::reduce::cols<int, int, int>,
::reduce::cols<int, float, float>,
::reduce::cols<int, double, double>
},
{
0/*::reduce::cols<float, float, unsigned char>*/,
0/*::reduce::cols<float, float, signed char>*/,
0/*::reduce::cols<float, float, unsigned short>*/,
0/*::reduce::cols<float, float, short>*/,
0/*::reduce::cols<float, float, int>*/,
::reduce::cols<float, float, float>,
::reduce::cols<float, double, double>
},
{
0/*::reduce::cols<double, double, unsigned char>*/,
0/*::reduce::cols<double, double, signed char>*/,
0/*::reduce::cols<double, double, unsigned short>*/,
0/*::reduce::cols<double, double, short>*/,
0/*::reduce::cols<double, double, int>*/,
0/*::reduce::cols<double, double, float>*/,
::reduce::cols<double, double, double>
}
};
const func_t func = funcs[src.depth()][dst.depth()];
if (!func)
CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
}
}
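// Usage sketch (illustrative only): with dim == 0 the matrix collapses to a single row
// (one reduced value per source column); with dim == 1 the result is a 1 x rows vector
// holding per-row reductions.
#if 0
    cv::cuda::GpuMat d_mat;       // assume CV_32FC1
    cv::cuda::GpuMat d_colSums;
    cv::cuda::reduce(d_mat, d_colSums, 0, cv::REDUCE_SUM, CV_32F);
#endif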
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::cuda::meanStdDev(InputArray _src, Scalar& mean, Scalar& stddev, GpuMat& buf)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
if (!deviceSupports(FEATURE_SET_COMPUTE_13))
        CV_Error(cv::Error::StsNotImplemented, "Insufficient compute capability (FEATURE_SET_COMPUTE_13 required)");
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
DeviceBuffer dbuf(2);
int bufSize;
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
#else
nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
#endif
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
cudaSafeCall( cudaDeviceSynchronize() );
double* ptrs[2] = {mean.val, stddev.val};
dbuf.download(ptrs);
}
//////////////////////////////////////////////////////////////////////////////
// rectStdDev
void cv::cuda::rectStdDev(InputArray _src, InputArray _sqr, OutputArray _dst, Rect rect, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
GpuMat sqr = _sqr.getGpuMat();
CV_Assert( src.type() == CV_32SC1 && sqr.type() == CV_64FC1 );
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
NppiRect nppRect;
nppRect.height = rect.height;
nppRect.width = rect.width;
nppRect.x = rect.x;
nppRect.y = rect.y;
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp64f>(), static_cast<int>(sqr.step),
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// normalize
void cv::cuda::normalize(InputArray _src, OutputArray dst, double a, double b, int norm_type, int dtype, InputArray mask, GpuMat& norm_buf, GpuMat& cvt_buf)
{
GpuMat src = _src.getGpuMat();
double scale = 1, shift = 0;
if (norm_type == NORM_MINMAX)
{
double smin = 0, smax = 0;
double dmin = std::min(a, b), dmax = std::max(a, b);
cuda::minMax(src, &smin, &smax, mask, norm_buf);
scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
shift = dmin - smin * scale;
}
else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
{
scale = cuda::norm(src, norm_type, mask, norm_buf);
scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
shift = 0;
}
else
{
CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
}
if (mask.empty())
{
src.convertTo(dst, dtype, scale, shift);
}
else
{
src.convertTo(cvt_buf, dtype, scale, shift);
cvt_buf.copyTo(dst, mask);
}
}
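// For NORM_MINMAX the affine map is scale = (dmax - dmin) / (smax - smin) and
// shift = dmin - smin * scale, which sends the source range [smin, smax] onto
// [min(a, b), max(a, b)]; for NORM_INF/L1/L2 the data is scaled so that the chosen
// norm becomes a. The conversion itself is delegated to GpuMat::convertTo, with an
// extra masked copy through cvt_buf when a mask is supplied.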
////////////////////////////////////////////////////////////////////////
// integral
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream);
}
}}}
void cv::cuda::integral(InputArray _src, OutputArray _dst, GpuMat& buffer, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
cudaStream_t stream = StreamAccessor::getStream(_stream);
cv::Size whole;
cv::Point offset;
src.locateROI(whole, offset);
if (deviceSupports(WARP_SHUFFLE_FUNCTIONS) && src.cols <= 2048
&& offset.x % 16 == 0 && ((src.cols + 63) / 64) * 64 <= (static_cast<int>(src.step) - offset.x))
{
ensureSizeIsEnough(((src.rows + 7) / 8) * 8, ((src.cols + 63) / 64) * 64, CV_32SC1, buffer);
cv::cuda::device::imgproc::shfl_integral_gpu(src, buffer, stream);
_dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
GpuMat dst = _dst.getGpuMat();
dst.setTo(Scalar::all(0), _stream);
GpuMat inner = dst(Rect(1, 1, src.cols, src.rows));
GpuMat res = buffer(Rect(0, 0, src.cols, src.rows));
res.copyTo(inner, _stream);
}
else
{
#ifndef HAVE_OPENCV_CUDALEGACY
throw_no_cuda();
#else
_dst.create(src.rows + 1, src.cols + 1, CV_32SC1);
GpuMat dst = _dst.getGpuMat();
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
NppStStreamHandler h(stream);
ncvSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), static_cast<int>(src.step),
dst.ptr<Ncv32u>(), static_cast<int>(dst.step), roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
}
//////////////////////////////////////////////////////////////////////////////
// sqrIntegral
void cv::cuda::sqrIntegral(InputArray _src, OutputArray _dst, GpuMat& buf, Stream& _stream)
{
#ifndef HAVE_OPENCV_CUDALEGACY
(void) _src;
(void) _dst;
(void) _stream;
throw_no_cuda();
#else
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8U );
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::cuda::getDevice()) );
Ncv32u bufSize;
ncvSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
ensureSizeIsEnough(1, bufSize, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStStreamHandler h(stream);
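// the squared integral of 8-bit data needs 64-bit accumulators, hence the CV_64F destination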
_dst.create(src.rows + 1, src.cols + 1, CV_64F);
GpuMat dst = _dst.getGpuMat();
ncvSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), static_cast<int>(src.step),
dst.ptr<Ncv64u>(0), static_cast<int>(dst.step), roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
#endif
}
#endif

View File

@@ -0,0 +1,441 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
//////////////////////////////////////////////////////////////////////////////
// GEMM
#ifdef HAVE_CUBLAS
CV_FLAGS(GemmFlags, 0, cv::GEMM_1_T, cv::GEMM_2_T, cv::GEMM_3_T);
#define ALL_GEMM_FLAGS testing::Values(GemmFlags(0), GemmFlags(cv::GEMM_1_T), GemmFlags(cv::GEMM_2_T), GemmFlags(cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_3_T), GemmFlags(cv::GEMM_1_T | cv::GEMM_2_T | cv::GEMM_3_T))
PARAM_TEST_CASE(GEMM, cv::cuda::DeviceInfo, cv::Size, MatType, GemmFlags, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
int flags;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
flags = GET_PARAM(3);
useRoi = GET_PARAM(4);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(GEMM, Accuracy)
{
cv::Mat src1 = randomMat(size, type, -10.0, 10.0);
cv::Mat src2 = randomMat(size, type, -10.0, 10.0);
cv::Mat src3 = randomMat(size, type, -10.0, 10.0);
double alpha = randomDouble(-10.0, 10.0);
double beta = randomDouble(-10.0, 10.0);
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
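// devices without native double support cannot process CV_64F input; StsUnsupportedFormat is the expected failure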
try
{
cv::cuda::GpuMat dst;
cv::cuda::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else if (type == CV_64FC2 && flags != 0)
{
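// transposition flags are not implemented for CV_64FC2 input, so StsNotImplemented is expected here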
try
{
cv::cuda::GpuMat dst;
cv::cuda::gemm(loadMat(src1), loadMat(src2), alpha, loadMat(src3), beta, dst, flags);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
}
}
else
{
cv::cuda::GpuMat dst = createMat(size, type, useRoi);
cv::cuda::gemm(loadMat(src1, useRoi), loadMat(src2, useRoi), alpha, loadMat(src3, useRoi), beta, dst, flags);
cv::Mat dst_gold;
cv::gemm(src1, src2, alpha, src3, beta, dst_gold, flags);
EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1e-10);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, GEMM, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_32FC1), MatType(CV_32FC2), MatType(CV_64FC1), MatType(CV_64FC2)),
ALL_GEMM_FLAGS,
WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Integral
PARAM_TEST_CASE(Integral, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
useRoi = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(Integral, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
cv::cuda::GpuMat dst = createMat(cv::Size(src.cols + 1, src.rows + 1), CV_32SC1, useRoi);
cv::cuda::integral(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::integral(src, dst_gold, CV_32S);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Integral, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////
// MulSpectrums
CV_FLAGS(DftFlags, 0, cv::DFT_INVERSE, cv::DFT_SCALE, cv::DFT_ROWS, cv::DFT_COMPLEX_OUTPUT, cv::DFT_REAL_OUTPUT)
PARAM_TEST_CASE(MulSpectrums, cv::cuda::DeviceInfo, cv::Size, DftFlags)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int flag;
cv::Mat a, b;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
flag = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
a = randomMat(size, CV_32FC2);
b = randomMat(size, CV_32FC2);
}
};
GPU_TEST_P(MulSpectrums, Simple)
{
cv::cuda::GpuMat c;
cv::cuda::mulSpectrums(loadMat(a), loadMat(b), c, flag, false);
cv::Mat c_gold;
cv::mulSpectrums(a, b, c_gold, flag, false);
EXPECT_MAT_NEAR(c_gold, c, 1e-2);
}
GPU_TEST_P(MulSpectrums, Scaled)
{
float scale = 1.f / size.area();
cv::cuda::GpuMat c;
cv::cuda::mulAndScaleSpectrums(loadMat(a), loadMat(b), c, flag, scale, false);
cv::Mat c_gold;
cv::mulSpectrums(a, b, c_gold, flag, false);
c_gold.convertTo(c_gold, c_gold.type(), scale);
EXPECT_MAT_NEAR(c_gold, c, 1e-2);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, MulSpectrums, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(DftFlags(0), DftFlags(cv::DFT_ROWS))));
////////////////////////////////////////////////////////////////////////////
// Dft
struct Dft : testing::TestWithParam<cv::cuda::DeviceInfo>
{
cv::cuda::DeviceInfo devInfo;
virtual void SetUp()
{
devInfo = GetParam();
cv::cuda::setDevice(devInfo.deviceID());
}
};
namespace
{
void testC2C(const std::string& hint, int cols, int rows, int flags, bool inplace)
{
SCOPED_TRACE(hint);
cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
cv::Mat b_gold;
cv::dft(a, b_gold, flags);
cv::cuda::GpuMat d_b;
cv::cuda::GpuMat d_b_data;
if (inplace)
{
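// exercise the in-place path: hand dft() a destination header that aliases preallocated device memory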
d_b_data.create(1, a.size().area(), CV_32FC2);
d_b = cv::cuda::GpuMat(a.rows, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
}
cv::cuda::dft(loadMat(a), d_b, cv::Size(cols, rows), flags);
EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
ASSERT_EQ(CV_32F, d_b.depth());
ASSERT_EQ(2, d_b.channels());
EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
}
}
GPU_TEST_P(Dft, C2C)
{
int cols = randomInt(2, 100);
int rows = randomInt(2, 100);
for (int i = 0; i < 2; ++i)
{
bool inplace = i != 0;
testC2C("no flags", cols, rows, 0, inplace);
testC2C("no flags 0 1", cols, rows + 1, 0, inplace);
testC2C("no flags 1 0", cols, rows + 1, 0, inplace);
testC2C("no flags 1 1", cols + 1, rows, 0, inplace);
testC2C("DFT_INVERSE", cols, rows, cv::DFT_INVERSE, inplace);
testC2C("DFT_ROWS", cols, rows, cv::DFT_ROWS, inplace);
testC2C("single col", 1, rows, 0, inplace);
testC2C("single row", cols, 1, 0, inplace);
testC2C("single col inversed", 1, rows, cv::DFT_INVERSE, inplace);
testC2C("single row inversed", cols, 1, cv::DFT_INVERSE, inplace);
testC2C("single row DFT_ROWS", cols, 1, cv::DFT_ROWS, inplace);
testC2C("size 1 2", 1, 2, 0, inplace);
testC2C("size 2 1", 2, 1, 0, inplace);
}
}
namespace
{
void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
{
SCOPED_TRACE(hint);
cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC1, 0.0, 10.0);
cv::cuda::GpuMat d_b, d_c;
cv::cuda::GpuMat d_b_data, d_c_data;
if (inplace)
{
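// the forward R2C transform stores only the non-redundant half of the spectrum:
// (rows/2 + 1) x cols for a single column, rows x (cols/2 + 1) otherwise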
if (a.cols == 1)
{
d_b_data.create(1, (a.rows / 2 + 1) * a.cols, CV_32FC2);
d_b = cv::cuda::GpuMat(a.rows / 2 + 1, a.cols, CV_32FC2, d_b_data.ptr(), a.cols * d_b_data.elemSize());
}
else
{
d_b_data.create(1, a.rows * (a.cols / 2 + 1), CV_32FC2);
d_b = cv::cuda::GpuMat(a.rows, a.cols / 2 + 1, CV_32FC2, d_b_data.ptr(), (a.cols / 2 + 1) * d_b_data.elemSize());
}
d_c_data.create(1, a.size().area(), CV_32F);
d_c = cv::cuda::GpuMat(a.rows, a.cols, CV_32F, d_c_data.ptr(), a.cols * d_c_data.elemSize());
}
cv::cuda::dft(loadMat(a), d_b, cv::Size(cols, rows), 0);
cv::cuda::dft(d_b, d_c, cv::Size(cols, rows), cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
EXPECT_TRUE(!inplace || d_c.ptr() == d_c_data.ptr());
ASSERT_EQ(CV_32F, d_c.depth());
ASSERT_EQ(1, d_c.channels());
cv::Mat c(d_c);
EXPECT_MAT_NEAR(a, c, rows * cols * 1e-5);
}
}
GPU_TEST_P(Dft, R2CThenC2R)
{
int cols = randomInt(2, 100);
int rows = randomInt(2, 100);
testR2CThenC2R("sanity", cols, rows, false);
testR2CThenC2R("sanity 0 1", cols, rows + 1, false);
testR2CThenC2R("sanity 1 0", cols + 1, rows, false);
testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, false);
testR2CThenC2R("single col", 1, rows, false);
testR2CThenC2R("single col 1", 1, rows + 1, false);
testR2CThenC2R("single row", cols, 1, false);
testR2CThenC2R("single row 1", cols + 1, 1, false);
testR2CThenC2R("sanity", cols, rows, true);
testR2CThenC2R("sanity 0 1", cols, rows + 1, true);
testR2CThenC2R("sanity 1 0", cols + 1, rows, true);
testR2CThenC2R("sanity 1 1", cols + 1, rows + 1, true);
testR2CThenC2R("single row", cols, 1, true);
testR2CThenC2R("single row 1", cols + 1, 1, true);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Dft, ALL_DEVICES);
////////////////////////////////////////////////////////
// Convolve
namespace
{
void convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
{
// reallocate the output array if needed
C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type());
cv::Size dftSize;
// compute the size of DFT transform
dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
// allocate temporary buffers and initialize them with 0s
cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0));
cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0));
// copy A and B to the top-left corners of tempA and tempB, respectively
cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows));
A.copyTo(roiA);
cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows));
B.copyTo(roiB);
// now transform the padded A & B in-place;
// use "nonzeroRows" hint for faster processing
cv::dft(tempA, tempA, 0, A.rows);
cv::dft(tempB, tempB, 0, B.rows);
// multiply the spectrums;
// the function handles packed spectrum representations well
cv::mulSpectrums(tempA, tempB, tempA, 0, ccorr);
// transform the product back from the frequency domain.
// Even though all the result rows will be non-zero,
// you need only the first C.rows of them, and thus you
// pass nonzeroRows == C.rows
cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
// now copy the result back to C.
tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
}
IMPLEMENT_PARAM_CLASS(KSize, int)
IMPLEMENT_PARAM_CLASS(Ccorr, bool)
}
PARAM_TEST_CASE(Convolve, cv::cuda::DeviceInfo, cv::Size, KSize, Ccorr)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int ksize;
bool ccorr;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
ksize = GET_PARAM(2);
ccorr = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(Convolve, Accuracy)
{
cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
cv::Ptr<cv::cuda::Convolution> conv = cv::cuda::createConvolution();
cv::cuda::GpuMat dst;
conv->convolve(loadMat(src), loadMat(kernel), dst, ccorr);
cv::Mat dst_gold;
convolveDFT(src, kernel, dst_gold, ccorr);
EXPECT_MAT_NEAR(dst, dst_gold, 1e-1);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Convolve, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(KSize(3), KSize(7), KSize(11), KSize(17), KSize(19), KSize(23), KSize(45)),
testing::Values(Ccorr(false), Ccorr(true))));
#endif // HAVE_CUBLAS
#endif // HAVE_CUDA

View File

@@ -0,0 +1,419 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
////////////////////////////////////////////////////////////////////////////////
// Merge
PARAM_TEST_CASE(Merge, cv::cuda::DeviceInfo, cv::Size, MatDepth, Channels, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth;
int channels;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth = GET_PARAM(2);
channels = GET_PARAM(3);
useRoi = GET_PARAM(4);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(Merge, Accuracy)
{
std::vector<cv::Mat> src;
src.reserve(channels);
for (int i = 0; i < channels; ++i)
src.push_back(cv::Mat(size, depth, cv::Scalar::all(i)));
std::vector<cv::cuda::GpuMat> d_src;
for (int i = 0; i < channels; ++i)
d_src.push_back(loadMat(src[i], useRoi));
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat dst;
cv::cuda::merge(d_src, dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat dst;
cv::cuda::merge(d_src, dst);
cv::Mat dst_gold;
cv::merge(src, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Merge, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
testing::Values(1, 2, 3, 4),
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// Split
PARAM_TEST_CASE(Split, cv::cuda::DeviceInfo, cv::Size, MatDepth, Channels, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth;
int channels;
bool useRoi;
int type;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth = GET_PARAM(2);
channels = GET_PARAM(3);
useRoi = GET_PARAM(4);
cv::cuda::setDevice(devInfo.deviceID());
type = CV_MAKE_TYPE(depth, channels);
}
};
GPU_TEST_P(Split, Accuracy)
{
cv::Mat src = randomMat(size, type);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
std::vector<cv::cuda::GpuMat> dst;
cv::cuda::split(loadMat(src), dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
std::vector<cv::cuda::GpuMat> dst;
cv::cuda::split(loadMat(src, useRoi), dst);
std::vector<cv::Mat> dst_gold;
cv::split(src, dst_gold);
ASSERT_EQ(dst_gold.size(), dst.size());
for (size_t i = 0; i < dst_gold.size(); ++i)
{
EXPECT_MAT_NEAR(dst_gold[i], dst[i], 0.0);
}
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Split, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
testing::Values(1, 2, 3, 4),
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// Transpose
PARAM_TEST_CASE(Transpose, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(Transpose, Accuracy)
{
cv::Mat src = randomMat(size, type);
if (CV_MAT_DEPTH(type) == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::GpuMat dst;
cv::cuda::transpose(loadMat(src), dst);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
cv::cuda::GpuMat dst = createMat(cv::Size(size.height, size.width), type, useRoi);
cv::cuda::transpose(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::transpose(src, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Transpose, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1),
MatType(CV_8UC4),
MatType(CV_16UC2),
MatType(CV_16SC2),
MatType(CV_32SC1),
MatType(CV_32SC2),
MatType(CV_64FC1)),
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// Flip
enum {FLIP_BOTH = 0, FLIP_X = 1, FLIP_Y = -1};
CV_ENUM(FlipCode, FLIP_BOTH, FLIP_X, FLIP_Y)
#define ALL_FLIP_CODES testing::Values(FlipCode(FLIP_BOTH), FlipCode(FLIP_X), FlipCode(FLIP_Y))
PARAM_TEST_CASE(Flip, cv::cuda::DeviceInfo, cv::Size, MatType, FlipCode, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
int flip_code;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
flip_code = GET_PARAM(3);
useRoi = GET_PARAM(4);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(Flip, Accuracy)
{
cv::Mat src = randomMat(size, type);
cv::cuda::GpuMat dst = createMat(size, type, useRoi);
cv::cuda::flip(loadMat(src, useRoi), dst, flip_code);
cv::Mat dst_gold;
cv::flip(src, dst_gold, flip_code);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Flip, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1),
MatType(CV_8UC3),
MatType(CV_8UC4),
MatType(CV_16UC1),
MatType(CV_16UC3),
MatType(CV_16UC4),
MatType(CV_32SC1),
MatType(CV_32SC3),
MatType(CV_32SC4),
MatType(CV_32FC1),
MatType(CV_32FC3),
MatType(CV_32FC4)),
ALL_FLIP_CODES,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// LUT
PARAM_TEST_CASE(LUT, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(LUT, OneChannel)
{
cv::Mat src = randomMat(size, type);
cv::Mat lut = randomMat(cv::Size(256, 1), CV_8UC1);
cv::Ptr<cv::cuda::LookUpTable> lutAlg = cv::cuda::createLookUpTable(lut);
cv::cuda::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
lutAlg->transform(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::LUT(src, lut, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
GPU_TEST_P(LUT, MultiChannel)
{
cv::Mat src = randomMat(size, type);
cv::Mat lut = randomMat(cv::Size(256, 1), CV_MAKE_TYPE(CV_8U, src.channels()));
cv::Ptr<cv::cuda::LookUpTable> lutAlg = cv::cuda::createLookUpTable(lut);
cv::cuda::GpuMat dst = createMat(size, CV_MAKE_TYPE(lut.depth(), src.channels()), useRoi);
lutAlg->transform(loadMat(src, useRoi), dst);
cv::Mat dst_gold;
cv::LUT(src, lut, dst_gold);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, LUT, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3)),
WHOLE_SUBMAT));
//////////////////////////////////////////////////////////////////////////////
// CopyMakeBorder
namespace
{
IMPLEMENT_PARAM_CLASS(Border, int)
}
PARAM_TEST_CASE(CopyMakeBorder, cv::cuda::DeviceInfo, cv::Size, MatType, Border, BorderType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
int border;
int borderType;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
border = GET_PARAM(3);
borderType = GET_PARAM(4);
useRoi = GET_PARAM(5);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(CopyMakeBorder, Accuracy)
{
cv::Mat src = randomMat(size, type);
cv::Scalar val = randomScalar(0, 255);
cv::cuda::GpuMat dst = createMat(cv::Size(size.width + 2 * border, size.height + 2 * border), type, useRoi);
cv::cuda::copyMakeBorder(loadMat(src, useRoi), dst, border, border, border, border, borderType, val);
cv::Mat dst_gold;
cv::copyMakeBorder(src, dst_gold, border, border, border, border, borderType, val);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, CopyMakeBorder, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1),
MatType(CV_8UC3),
MatType(CV_8UC4),
MatType(CV_16UC1),
MatType(CV_16UC3),
MatType(CV_16UC4),
MatType(CV_32FC1),
MatType(CV_32FC3),
MatType(CV_32FC4)),
testing::Values(Border(1), Border(10), Border(50)),
ALL_BORDER_TYPES,
WHOLE_SUBMAT));
#endif // HAVE_CUDA

File diff suppressed because it is too large

View File

@@ -0,0 +1,45 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
CV_GPU_TEST_MAIN("gpu")

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"

View File

@@ -0,0 +1,65 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# if defined __clang__ || defined __APPLE__
# pragma GCC diagnostic ignored "-Wmissing-prototypes"
# pragma GCC diagnostic ignored "-Wextra"
# endif
#endif
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <functional>
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_test.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/core.hpp"
#include "opencv2/imgproc.hpp"
#include "cvconfig.h"
#endif

View File

@@ -0,0 +1,819 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
#ifdef HAVE_CUDA
using namespace cvtest;
////////////////////////////////////////////////////////////////////////////////
// Norm
PARAM_TEST_CASE(Norm, cv::cuda::DeviceInfo, cv::Size, MatDepth, NormCode, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth;
int normCode;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth = GET_PARAM(2);
normCode = GET_PARAM(3);
useRoi = GET_PARAM(4);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(Norm, Accuracy)
{
cv::Mat src = randomMat(size, depth);
cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
cv::cuda::GpuMat d_buf;
double val = cv::cuda::norm(loadMat(src, useRoi), normCode, loadMat(mask, useRoi), d_buf);
double val_gold = cv::norm(src, normCode, mask);
EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Norm, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U),
MatDepth(CV_8S),
MatDepth(CV_16U),
MatDepth(CV_16S),
MatDepth(CV_32S),
MatDepth(CV_32F)),
testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// normDiff
PARAM_TEST_CASE(NormDiff, cv::cuda::DeviceInfo, cv::Size, NormCode, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int normCode;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
normCode = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(NormDiff, Accuracy)
{
cv::Mat src1 = randomMat(size, CV_8UC1);
cv::Mat src2 = randomMat(size, CV_8UC1);
double val = cv::cuda::norm(loadMat(src1, useRoi), loadMat(src2, useRoi), normCode);
double val_gold = cv::norm(src1, src2, normCode);
EXPECT_NEAR(val_gold, val, 0.0);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, NormDiff, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
WHOLE_SUBMAT));
//////////////////////////////////////////////////////////////////////////////
// Sum
namespace
{
template <typename T>
cv::Scalar absSumImpl(const cv::Mat& src)
{
const int cn = src.channels();
cv::Scalar sum = cv::Scalar::all(0);
for (int y = 0; y < src.rows; ++y)
{
for (int x = 0; x < src.cols; ++x)
{
for (int c = 0; c < cn; ++c)
sum[c] += std::abs(src.at<T>(y, x * cn + c));
}
}
return sum;
}
cv::Scalar absSumGold(const cv::Mat& src)
{
typedef cv::Scalar (*func_t)(const cv::Mat& src);
static const func_t funcs[] =
{
absSumImpl<uchar>,
absSumImpl<schar>,
absSumImpl<ushort>,
absSumImpl<short>,
absSumImpl<int>,
absSumImpl<float>,
absSumImpl<double>
};
return funcs[src.depth()](src);
}
template <typename T>
cv::Scalar sqrSumImpl(const cv::Mat& src)
{
const int cn = src.channels();
cv::Scalar sum = cv::Scalar::all(0);
for (int y = 0; y < src.rows; ++y)
{
for (int x = 0; x < src.cols; ++x)
{
for (int c = 0; c < cn; ++c)
{
const T val = src.at<T>(y, x * cn + c);
sum[c] += val * val;
}
}
}
return sum;
}
cv::Scalar sqrSumGold(const cv::Mat& src)
{
typedef cv::Scalar (*func_t)(const cv::Mat& src);
static const func_t funcs[] =
{
sqrSumImpl<uchar>,
sqrSumImpl<schar>,
sqrSumImpl<ushort>,
sqrSumImpl<short>,
sqrSumImpl<int>,
sqrSumImpl<float>,
sqrSumImpl<double>
};
return funcs[src.depth()](src);
}
}
PARAM_TEST_CASE(Sum, cv::cuda::DeviceInfo, cv::Size, MatType, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
bool useRoi;
cv::Mat src;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
src = randomMat(size, type, -128.0, 128.0);
}
};
GPU_TEST_P(Sum, Simple)
{
cv::Scalar val = cv::cuda::sum(loadMat(src, useRoi));
cv::Scalar val_gold = cv::sum(src);
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
GPU_TEST_P(Sum, Abs)
{
cv::Scalar val = cv::cuda::absSum(loadMat(src, useRoi));
cv::Scalar val_gold = absSumGold(src);
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
GPU_TEST_P(Sum, Sqr)
{
cv::Scalar val = cv::cuda::sqrSum(loadMat(src, useRoi));
cv::Scalar val_gold = sqrSumGold(src);
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Sum, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
TYPES(CV_8U, CV_64F, 1, 4),
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// MinMax
PARAM_TEST_CASE(MinMax, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(MinMax, WithoutMask)
{
cv::Mat src = randomMat(size, depth);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::cuda::minMax(loadMat(src), &minVal, &maxVal);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::cuda::minMax(loadMat(src, useRoi), &minVal, &maxVal);
double minVal_gold, maxVal_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
}
}
GPU_TEST_P(MinMax, WithMask)
{
cv::Mat src = randomMat(size, depth);
cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::cuda::minMax(loadMat(src), &minVal, &maxVal, loadMat(mask));
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::cuda::minMax(loadMat(src, useRoi), &minVal, &maxVal, loadMat(mask, useRoi));
double minVal_gold, maxVal_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, 0, 0, mask);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
}
}
GPU_TEST_P(MinMax, NullPtr)
{
cv::Mat src = randomMat(size, depth);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::cuda::minMax(loadMat(src), &minVal, 0);
cv::cuda::minMax(loadMat(src), 0, &maxVal);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::cuda::minMax(loadMat(src, useRoi), &minVal, 0);
cv::cuda::minMax(loadMat(src, useRoi), 0, &maxVal);
double minVal_gold, maxVal_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, 0, 0);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, MinMax, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// MinMaxLoc
namespace
{
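// an extremum can occur at several positions, so compare the pixel values at the found
// locations rather than the coordinates themselves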
template <typename T>
void expectEqualImpl(const cv::Mat& src, cv::Point loc_gold, cv::Point loc)
{
EXPECT_EQ(src.at<T>(loc_gold.y, loc_gold.x), src.at<T>(loc.y, loc.x));
}
void expectEqual(const cv::Mat& src, cv::Point loc_gold, cv::Point loc)
{
typedef void (*func_t)(const cv::Mat& src, cv::Point loc_gold, cv::Point loc);
static const func_t funcs[] =
{
expectEqualImpl<uchar>,
expectEqualImpl<schar>,
expectEqualImpl<ushort>,
expectEqualImpl<short>,
expectEqualImpl<int>,
expectEqualImpl<float>,
expectEqualImpl<double>
};
funcs[src.depth()](src, loc_gold, loc);
}
}
PARAM_TEST_CASE(MinMaxLoc, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(MinMaxLoc, WithoutMask)
{
cv::Mat src = randomMat(size, depth);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::cuda::minMaxLoc(loadMat(src), &minVal, &maxVal, &minLoc, &maxLoc);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::cuda::minMaxLoc(loadMat(src, useRoi), &minVal, &maxVal, &minLoc, &maxLoc);
double minVal_gold, maxVal_gold;
cv::Point minLoc_gold, maxLoc_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
expectEqual(src, minLoc_gold, minLoc);
expectEqual(src, maxLoc_gold, maxLoc);
}
}
GPU_TEST_P(MinMaxLoc, WithMask)
{
cv::Mat src = randomMat(size, depth);
cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::cuda::minMaxLoc(loadMat(src), &minVal, &maxVal, &minLoc, &maxLoc, loadMat(mask));
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::cuda::minMaxLoc(loadMat(src, useRoi), &minVal, &maxVal, &minLoc, &maxLoc, loadMat(mask, useRoi));
double minVal_gold, maxVal_gold;
cv::Point minLoc_gold, maxLoc_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold, mask);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
expectEqual(src, minLoc_gold, minLoc);
expectEqual(src, maxLoc_gold, maxLoc);
}
}
GPU_TEST_P(MinMaxLoc, NullPtr)
{
cv::Mat src = randomMat(size, depth);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::cuda::minMaxLoc(loadMat(src, useRoi), &minVal, 0, 0, 0);
cv::cuda::minMaxLoc(loadMat(src, useRoi), 0, &maxVal, 0, 0);
cv::cuda::minMaxLoc(loadMat(src, useRoi), 0, 0, &minLoc, 0);
cv::cuda::minMaxLoc(loadMat(src, useRoi), 0, 0, 0, &maxLoc);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::cuda::minMaxLoc(loadMat(src, useRoi), &minVal, 0, 0, 0);
cv::cuda::minMaxLoc(loadMat(src, useRoi), 0, &maxVal, 0, 0);
cv::cuda::minMaxLoc(loadMat(src, useRoi), 0, 0, &minLoc, 0);
cv::cuda::minMaxLoc(loadMat(src, useRoi), 0, 0, 0, &maxLoc);
double minVal_gold, maxVal_gold;
cv::Point minLoc_gold, maxLoc_gold;
minMaxLocGold(src, &minVal_gold, &maxVal_gold, &minLoc_gold, &maxLoc_gold);
EXPECT_DOUBLE_EQ(minVal_gold, minVal);
EXPECT_DOUBLE_EQ(maxVal_gold, maxVal);
expectEqual(src, minLoc_gold, minLoc);
expectEqual(src, maxLoc_gold, maxLoc);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, MinMaxLoc, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////
// CountNonZero
PARAM_TEST_CASE(CountNonZero, cv::cuda::DeviceInfo, cv::Size, MatDepth, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth = GET_PARAM(2);
useRoi = GET_PARAM(3);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(CountNonZero, Accuracy)
{
cv::Mat srcBase = randomMat(size, CV_8U, 0.0, 1.5);
cv::Mat src;
srcBase.convertTo(src, depth);
if (depth == CV_64F && !supportFeature(devInfo, cv::cuda::NATIVE_DOUBLE))
{
try
{
cv::cuda::countNonZero(loadMat(src));
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsUnsupportedFormat, e.code);
}
}
else
{
int val = cv::cuda::countNonZero(loadMat(src, useRoi));
int val_gold = cv::countNonZero(src);
ASSERT_EQ(val_gold, val);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, CountNonZero, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
WHOLE_SUBMAT));
//////////////////////////////////////////////////////////////////////////////
// Reduce
CV_ENUM(ReduceCode, cv::REDUCE_SUM, cv::REDUCE_AVG, cv::REDUCE_MAX, cv::REDUCE_MIN)
#define ALL_REDUCE_CODES testing::Values(ReduceCode(cv::REDUCE_SUM), ReduceCode(cv::REDUCE_AVG), ReduceCode(cv::REDUCE_MAX), ReduceCode(cv::REDUCE_MIN))
PARAM_TEST_CASE(Reduce, cv::cuda::DeviceInfo, cv::Size, MatDepth, Channels, ReduceCode, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int depth;
int channels;
int reduceOp;
bool useRoi;
int type;
int dst_depth;
int dst_type;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
depth = GET_PARAM(2);
channels = GET_PARAM(3);
reduceOp = GET_PARAM(4);
useRoi = GET_PARAM(5);
cv::cuda::setDevice(devInfo.deviceID());
type = CV_MAKE_TYPE(depth, channels);
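// choose the destination depth: MIN/MAX keep the source depth, SUM maps 8U to 32S and other
// depths below 64F to 32F, AVG uses at least 32F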
if (reduceOp == cv::REDUCE_MAX || reduceOp == cv::REDUCE_MIN)
dst_depth = depth;
else if (reduceOp == cv::REDUCE_SUM)
dst_depth = depth == CV_8U ? CV_32S : depth < CV_64F ? CV_32F : depth;
else
dst_depth = depth < CV_32F ? CV_32F : depth;
dst_type = CV_MAKE_TYPE(dst_depth, channels);
}
};
GPU_TEST_P(Reduce, Rows)
{
cv::Mat src = randomMat(size, type);
cv::cuda::GpuMat dst = createMat(cv::Size(src.cols, 1), dst_type, useRoi);
cv::cuda::reduce(loadMat(src, useRoi), dst, 0, reduceOp, dst_depth);
cv::Mat dst_gold;
cv::reduce(src, dst_gold, 0, reduceOp, dst_depth);
EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 0.02);
}
GPU_TEST_P(Reduce, Cols)
{
cv::Mat src = randomMat(size, type);
cv::cuda::GpuMat dst = createMat(cv::Size(src.rows, 1), dst_type, useRoi);
cv::cuda::reduce(loadMat(src, useRoi), dst, 1, reduceOp, dst_depth);
cv::Mat dst_gold;
cv::reduce(src, dst_gold, 1, reduceOp, dst_depth);
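// cv::reduce along dim 1 yields a column vector; reinterpret its header as a single row
// so it matches the GPU result layout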
dst_gold.cols = dst_gold.rows;
dst_gold.rows = 1;
dst_gold.step = dst_gold.cols * dst_gold.elemSize();
EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 0.02);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Reduce, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U),
MatDepth(CV_16U),
MatDepth(CV_16S),
MatDepth(CV_32F),
MatDepth(CV_64F)),
ALL_CHANNELS,
ALL_REDUCE_CODES,
WHOLE_SUBMAT));
//////////////////////////////////////////////////////////////////////////////
// Normalize
PARAM_TEST_CASE(Normalize, cv::cuda::DeviceInfo, cv::Size, MatDepth, NormCode, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
int norm_type;
bool useRoi;
double alpha;
double beta;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
norm_type = GET_PARAM(3);
useRoi = GET_PARAM(4);
cv::cuda::setDevice(devInfo.deviceID());
alpha = 1;
beta = 0;
}
};
GPU_TEST_P(Normalize, WithOutMask)
{
cv::Mat src = randomMat(size, type);
cv::cuda::GpuMat dst = createMat(size, type, useRoi);
cv::cuda::normalize(loadMat(src, useRoi), dst, alpha, beta, norm_type, type);
cv::Mat dst_gold;
cv::normalize(src, dst_gold, alpha, beta, norm_type, type);
EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
}
GPU_TEST_P(Normalize, WithMask)
{
cv::Mat src = randomMat(size, type);
cv::Mat mask = randomMat(size, CV_8UC1, 0, 2);
cv::cuda::GpuMat dst = createMat(size, type, useRoi);
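// zero both destinations so the pixels left untouched outside the mask compare equal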
dst.setTo(cv::Scalar::all(0));
cv::cuda::normalize(loadMat(src, useRoi), dst, alpha, beta, norm_type, type, loadMat(mask, useRoi));
cv::Mat dst_gold(size, type);
dst_gold.setTo(cv::Scalar::all(0));
cv::normalize(src, dst_gold, alpha, beta, norm_type, type, mask);
EXPECT_MAT_NEAR(dst_gold, dst, 1e-6);
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, Normalize, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF), NormCode(cv::NORM_MINMAX)),
WHOLE_SUBMAT));
////////////////////////////////////////////////////////////////////////////////
// MeanStdDev
PARAM_TEST_CASE(MeanStdDev, cv::cuda::DeviceInfo, cv::Size, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
bool useRoi;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
useRoi = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
};
GPU_TEST_P(MeanStdDev, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
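// meanStdDev is only implemented for devices with compute capability >= 1.3;
// older devices are expected to raise StsNotImplemented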
if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_13))
{
try
{
cv::Scalar mean;
cv::Scalar stddev;
cv::cuda::meanStdDev(loadMat(src, useRoi), mean, stddev);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
}
}
else
{
cv::Scalar mean;
cv::Scalar stddev;
cv::cuda::meanStdDev(loadMat(src, useRoi), mean, stddev);
cv::Scalar mean_gold;
cv::Scalar stddev_gold;
cv::meanStdDev(src, mean_gold, stddev_gold);
EXPECT_SCALAR_NEAR(mean_gold, mean, 1e-5);
EXPECT_SCALAR_NEAR(stddev_gold, stddev, 1e-5);
}
}
INSTANTIATE_TEST_CASE_P(GPU_Arithm, MeanStdDev, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
WHOLE_SUBMAT));
#endif // HAVE_CUDA