gpuarithm module for arithmetics operations on matrices

2013-04-17 17:39:17 +04:00
parent 1b00a3ed54
commit 31c8b527c6
64 changed files with 6425 additions and 4476 deletions
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -3,7 +3,7 @@ if(ANDROID OR IOS)
 endif()

 set(the_description "GPU-accelerated Computer Vision")
-ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)
+ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy opencv_gpuarithm)

 ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")

@@ -58,10 +58,6 @@ if(HAVE_CUDA)
    CUDA_ADD_CUFFT_TO_TARGET(${the_module})
  endif()

-  if(HAVE_CUBLAS)
-    CUDA_ADD_CUBLAS_TO_TARGET(${the_module})
-  endif()
-
  install(FILES src/nvidia/NPP_staging/NPP_staging.hpp  src/nvidia/core/NCV.hpp
    DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2/${name}
    COMPONENT main)
--- a/modules/gpu/doc/gpu.rst
+++ b/modules/gpu/doc/gpu.rst
@@ -8,10 +8,7 @@ gpu. GPU-accelerated Computer Vision
    introduction
    initalization_and_information
    data_structures
-    operations_on_matrices
-    per_element_operations
    image_processing
-    matrix_reductions
    object_detection
    feature_detection_and_description
    image_filtering
--- a/modules/gpu/doc/image_processing.rst
+++ b/modules/gpu/doc/image_processing.rst
@@ -414,28 +414,6 @@ The methods support arbitrary permutations of the original channels, including r



-gpu::threshold
------------------
-Applies a fixed-level threshold to each array element.
-
-.. ocv:function:: double gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null())
-
-    :param src: Source array (single-channel).
-
-    :param dst: Destination array with the same size and type as  ``src`` .
-
-    :param thresh: Threshold value.
-
-    :param maxval: Maximum value to use with  ``THRESH_BINARY`` and  ``THRESH_BINARY_INV`` threshold types.
-
-    :param type: Threshold type. For details, see  :ocv:func:`threshold` . The ``THRESH_OTSU`` threshold type is not supported.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`threshold`
-
-
-
 gpu::resize
 ---------------
 Resizes an image.
--- a/modules/gpu/doc/matrix_reductions.rst
+++ b/modules/gpu/doc/matrix_reductions.rst
@@ -1,207 +0,0 @@
-Matrix Reductions
-=================
-
-.. highlight:: cpp
-
-
-
-gpu::meanStdDev
-------------------
-Computes a mean value and a standard deviation of matrix elements.
-
-.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev)
-.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)
-
-    :param mtx: Source matrix.  ``CV_8UC1``  matrices are supported for now.
-
-    :param mean: Mean value.
-
-    :param stddev: Standard deviation value.
-
-    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-.. seealso:: :ocv:func:`meanStdDev`
-
-
-
-gpu::norm
-------------
-Returns the norm of a matrix (or difference of two matrices).
-
-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType=NORM_L2)
-
-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, GpuMat& buf)
-
-.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf)
-
-.. ocv:function:: double gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2)
-
-    :param src1: Source matrix. Any matrices except 64F are supported.
-
-    :param src2: Second source matrix (if any) with the same size and type as ``src1``.
-
-    :param normType: Norm type.  ``NORM_L1`` ,  ``NORM_L2`` , and  ``NORM_INF``  are supported for now.
-
-    :param mask: optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type.
-
-    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-.. seealso:: :ocv:func:`norm`
-
-
-
-gpu::sum
------------
-Returns the sum of matrix elements.
-
-.. ocv:function:: Scalar gpu::sum(const GpuMat& src)
-
-.. ocv:function:: Scalar gpu::sum(const GpuMat& src, GpuMat& buf)
-
-.. ocv:function:: Scalar gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-
-    :param src: Source image of any depth except for ``CV_64F`` .
-
-    :param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
-
-    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-.. seealso:: :ocv:func:`sum`
-
-
-
-gpu::absSum
---------------
-Returns the sum of absolute values for matrix elements.
-
-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src)
-
-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, GpuMat& buf)
-
-.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-
-    :param src: Source image of any depth except for ``CV_64F`` .
-
-    :param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
-
-    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-
-
-gpu::sqrSum
---------------
-Returns the squared sum of matrix elements.
-
-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src)
-
-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, GpuMat& buf)
-
-.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-
-    :param src: Source image of any depth except for ``CV_64F`` .
-
-    :param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
-
-    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-
-
-gpu::minMax
---------------
-Finds global minimum and maximum matrix elements and returns their values.
-
-.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat())
-
-.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
-
-    :param src: Single-channel source image.
-
-    :param minVal: Pointer to the returned minimum value.  Use ``NULL``  if not required.
-
-    :param maxVal: Pointer to the returned maximum value.  Use ``NULL``  if not required.
-
-    :param mask: Optional mask to select a sub-matrix.
-
-    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
-
-.. seealso:: :ocv:func:`minMaxLoc`
-
-
-
-gpu::minMaxLoc
------------------
-Finds global minimum and maximum matrix elements and returns their values with locations.
-
-.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, const GpuMat& mask=GpuMat())
-
-.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
-
-    :param src: Single-channel source image.
-
-    :param minVal: Pointer to the returned minimum value. Use ``NULL``  if not required.
-
-    :param maxVal: Pointer to the returned maximum value. Use ``NULL``  if not required.
-
-    :param minLoc: Pointer to the returned minimum location. Use ``NULL``  if not required.
-
-    :param maxLoc: Pointer to the returned maximum location. Use ``NULL``  if not required.
-
-    :param mask: Optional mask to select a sub-matrix.
-
-    :param valbuf: Optional values buffer to avoid extra memory allocations. It is resized automatically.
-
-    :param locbuf: Optional locations buffer to avoid extra memory allocations. It is resized automatically.
-
-    The function does not work with ``CV_64F`` images on GPU with the compute capability < 1.3.
-
-.. seealso:: :ocv:func:`minMaxLoc`
-
-
-
-gpu::countNonZero
---------------------
-Counts non-zero matrix elements.
-
-.. ocv:function:: int gpu::countNonZero(const GpuMat& src)
-
-.. ocv:function:: int gpu::countNonZero(const GpuMat& src, GpuMat& buf)
-
-    :param src: Single-channel source image.
-
-    :param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
-
-.. seealso:: :ocv:func:`countNonZero`
-
-
-
-gpu::reduce
-----------
-Reduces a matrix to a vector.
-
-.. ocv:function:: void gpu::reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())
-
-    :param mtx: Source 2D matrix.
-
-    :param vec: Destination vector. Its size and type are defined by the  ``dim``  and  ``dtype``  parameters.
-
-    :param dim: Dimension index along which the matrix is reduced. 0 means that the matrix is reduced to a single row. 1 means that the matrix is reduced to a single column.
-
-    :param reduceOp: Reduction operation that could be one of the following:
-
-            * **CV_REDUCE_SUM** The output is the sum of all rows/columns of the matrix.
-
-            * **CV_REDUCE_AVG** The output is the mean vector of all rows/columns of the matrix.
-
-            * **CV_REDUCE_MAX** The output is the maximum (column/row-wise) of all rows/columns of the matrix.
-
-            * **CV_REDUCE_MIN** The output is the minimum (column/row-wise) of all rows/columns of the matrix.
-
-    :param dtype: When it is negative, the destination vector will have the same type as the source matrix. Otherwise, its type will be  ``CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), mtx.channels())`` .
-
-The function ``reduce`` reduces the matrix to a vector by treating the matrix rows/columns as a set of 1D vectors and performing the specified operation on the vectors until a single row/column is obtained. For example, the function can be used to compute horizontal and vertical projections of a raster image. In case of ``CV_REDUCE_SUM`` and ``CV_REDUCE_AVG`` , the output may have a larger element bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction modes.
-
-.. seealso:: :ocv:func:`reduce`
--- a/modules/gpu/doc/operations_on_matrices.rst
+++ b/modules/gpu/doc/operations_on_matrices.rst
@@ -1,274 +0,0 @@
-Operations on Matrices
-======================
-
-.. highlight:: cpp
-
-
-
-gpu::gemm
------------------
-Performs generalized matrix multiplication.
-
-.. ocv:function:: void gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null())
-
-    :param src1: First multiplied input matrix that should have  ``CV_32FC1`` , ``CV_64FC1`` , ``CV_32FC2`` , or  ``CV_64FC2``  type.
-
-    :param src2: Second multiplied input matrix of the same type as  ``src1`` .
-
-    :param alpha: Weight of the matrix product.
-
-    :param src3: Third optional delta matrix added to the matrix product. It should have the same type as  ``src1``  and  ``src2`` .
-
-    :param beta: Weight of  ``src3`` .
-
-    :param dst: Destination matrix. It has the proper size and the same type as input matrices.
-
-    :param flags: Operation flags:
-
-            * **GEMM_1_T** transpose  ``src1``
-            * **GEMM_2_T** transpose  ``src2``
-            * **GEMM_3_T** transpose  ``src3``
-
-    :param stream: Stream for the asynchronous version.
-
-The function performs generalized matrix multiplication similar to the ``gemm`` functions in BLAS level 3. For example, ``gemm(src1, src2, alpha, src3, beta, dst, GEMM_1_T + GEMM_3_T)`` corresponds to
-
-.. math::
-
-    \texttt{dst} =  \texttt{alpha} \cdot \texttt{src1} ^T  \cdot \texttt{src2} +  \texttt{beta} \cdot \texttt{src3} ^T
-
-.. note:: Transposition operation doesn't support  ``CV_64FC2``  input type.
-
-.. seealso:: :ocv:func:`gemm`
-
-
-
-gpu::transpose
------------------
-Transposes a matrix.
-
-.. ocv:function:: void gpu::transpose( const GpuMat& src1, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc).
-
-    :param dst: Destination matrix.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`transpose`
-
-
-
-gpu::flip
-------------
-Flips a 2D matrix around vertical, horizontal, or both axes.
-
-.. ocv:function:: void gpu::flip( const GpuMat& a, GpuMat& b, int flipCode, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
-
-    :param b: Destination matrix.
-
-    :param flipCode: Flip mode for the source:
-
-        * ``0`` Flips around x-axis.
-
-        * ``>0`` Flips around y-axis.
-
-        * ``<0`` Flips around both axes.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`flip`
-
-
-
-gpu::LUT
------------
-Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))``
-
-.. ocv:function:: void gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix.  ``CV_8UC1``  and  ``CV_8UC3``  matrices are supported for now.
-
-    :param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
-
-    :param dst: Destination matrix with the same depth as  ``lut``  and the same number of channels as  ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`LUT`
-
-
-
-gpu::merge
--------------
-Makes a multi-channel matrix out of several single-channel matrices.
-
-.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Array/vector of source matrices.
-
-    :param n: Number of source matrices.
-
-    :param dst: Destination matrix.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`merge`
-
-
-
-gpu::split
--------------
-Copies each plane of a multi-channel matrix into an array.
-
-.. ocv:function:: void gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix.
-
-    :param dst: Destination array/vector of single-channel matrices.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`split`
-
-
-
-gpu::magnitude
------------------
-Computes magnitudes of complex matrix elements.
-
-.. ocv:function:: void gpu::magnitude( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
-
-    :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
-
-    :param x: Source matrix containing real components ( ``CV_32FC1`` ).
-
-    :param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
-
-    :param magnitude: Destination matrix of float magnitudes ( ``CV_32FC1`` ).
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`magnitude`
-
-
-
-gpu::magnitudeSqr
---------------------
-Computes squared magnitudes of complex matrix elements.
-
-.. ocv:function:: void gpu::magnitudeSqr( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
-
-    :param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
-
-    :param x: Source matrix containing real components ( ``CV_32FC1`` ).
-
-    :param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
-
-    :param magnitude: Destination matrix of float magnitude squares ( ``CV_32FC1`` ).
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::phase
--------------
-Computes polar angles of complex matrix elements.
-
-.. ocv:function:: void gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
-
-    :param x: Source matrix containing real components ( ``CV_32FC1`` ).
-
-    :param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
-
-    :param angle: Destination matrix of angles ( ``CV_32FC1`` ).
-
-    :param angleInDegrees: Flag for angles that must be evaluated in degrees.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`phase`
-
-
-
-gpu::cartToPolar
--------------------
-Converts Cartesian coordinates into polar.
-
-.. ocv:function:: void gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
-
-    :param x: Source matrix containing real components ( ``CV_32FC1`` ).
-
-    :param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
-
-    :param magnitude: Destination matrix of float magnitudes ( ``CV_32FC1`` ).
-
-    :param angle: Destination matrix of angles ( ``CV_32FC1`` ).
-
-    :param angleInDegrees: Flag for angles that must be evaluated in degrees.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`cartToPolar`
-
-
-
-gpu::polarToCart
--------------------
-Converts polar coordinates into Cartesian.
-
-.. ocv:function:: void gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees=false, Stream& stream = Stream::Null())
-
-    :param magnitude: Source matrix containing magnitudes ( ``CV_32FC1`` ).
-
-    :param angle: Source matrix containing angles ( ``CV_32FC1`` ).
-
-    :param x: Destination matrix of real components ( ``CV_32FC1`` ).
-
-    :param y: Destination matrix of imaginary components ( ``CV_32FC1`` ).
-
-    :param angleInDegrees: Flag that indicates angles in degrees.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`polarToCart`
-
-
-
-gpu::normalize
--------------
-Normalizes the norm or value range of an array.
-
-.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat())
-
-.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-
-    :param src: input array.
-
-    :param dst: output array of the same size as  ``src`` .
-
-    :param alpha: norm value to normalize to or the lower range boundary in case of the range normalization.
-
-    :param beta: upper range boundary in case of the range normalization; it is not used for the norm normalization.
-
-    :param norm_type: normalization type (see the details below).
-
-    :param dtype: when negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as  ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
-
-    :param mask: optional operation mask.
-
-    :param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-    :param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
-
-.. seealso:: :ocv:func:`normalize`
--- a/modules/gpu/doc/per_element_operations.rst
+++ b/modules/gpu/doc/per_element_operations.rst
@@ -1,445 +0,0 @@
-Per-element Operations
-=======================
-
-.. highlight:: cpp
-
-
-
-gpu::add
------------
-Computes a matrix-matrix or matrix-scalar sum.
-
-.. ocv:function:: void gpu::add( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::add( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix to be added to ``a`` . Matrix should have the same size and type as ``a`` .
-
-    :param sc: A scalar to be added to ``a`` .
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
-
-    :param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
-
-    :param dtype: Optional depth of the output array.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`add`
-
-
-
-gpu::subtract
-----------------
-Computes a matrix-matrix or matrix-scalar difference.
-
-.. ocv:function:: void gpu::subtract( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::subtract( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix to be subtracted from ``a`` . Matrix should have the same size and type as ``a`` .
-
-    :param sc: A scalar to be subtracted from ``a`` .
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
-
-    :param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
-
-    :param dtype: Optional depth of the output array.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`subtract`
-
-
-
-gpu::multiply
-----------------
-Computes a matrix-matrix or matrix-scalar per-element product.
-
-.. ocv:function:: void gpu::multiply( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::multiply( const GpuMat& a, const Scalar& sc, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix to be multiplied by ``a`` elements.
-
-    :param sc: A scalar to be multiplied by ``a`` elements.
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
-
-    :param scale: Optional scale factor.
-
-    :param dtype: Optional depth of the output array.
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`multiply`
-
-
-
-gpu::divide
-----------
-Computes a matrix-matrix or matrix-scalar division.
-
-.. ocv:function:: void gpu::divide( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::divide( double scale, const GpuMat& b, GpuMat& c, int dtype=-1, Stream& stream=Stream::Null() )
-
-    :param a: First source matrix or a scalar.
-
-    :param b: Second source matrix. The ``a`` elements are divided by it.
-
-    :param sc: A scalar by which the elements of the ``a`` matrix are divided.
-
-    :param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
-
-    :param scale: Optional scale factor.
-
-    :param dtype: Optional depth of the output array.
-
-    :param stream: Stream for the asynchronous version.
-
-This function, in contrast to :ocv:func:`divide`, uses a round-down rounding mode.
-
-.. seealso:: :ocv:func:`divide`
-
-
-gpu::addWeighted
----------------
-Computes the weighted sum of two arrays.
-
-.. ocv:function:: void gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null())
-
-    :param src1: First source array.
-
-    :param alpha: Weight for the first array elements.
-
-    :param src2: Second source array of the same size and channel number as  ``src1`` .
-
-    :param beta: Weight for the second array elements.
-
-    :param dst: Destination array that has the same size and number of channels as the input arrays.
-
-    :param gamma: Scalar added to each sum.
-
-    :param dtype: Optional depth of the destination array. When both input arrays have the same depth, ``dtype`` can be set to ``-1``, which will be equivalent to ``src1.depth()``.
-
-    :param stream: Stream for the asynchronous version.
-
-The function ``addWeighted`` calculates the weighted sum of two arrays as follows:
-
-.. math::
-
-    \texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} +  \texttt{src2} (I)* \texttt{beta} +  \texttt{gamma} )
-
-where ``I`` is a multi-dimensional index of array elements. In case of multi-channel arrays, each channel is processed independently.
-
-.. seealso:: :ocv:func:`addWeighted`
-
-
-
-gpu::abs
------------
-Computes an absolute value of each matrix element.
-
-.. ocv:function:: void gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`abs`
-
-
-
-gpu::sqr
------------
-Computes a square value of each matrix element.
-
-.. ocv:function:: void gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::sqrt
------------
-Computes a square root of each matrix element.
-
-.. ocv:function:: void gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`sqrt`
-
-
-
-gpu::exp
------------
-Computes an exponent of each matrix element.
-
-.. ocv:function:: void gpu::exp( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param b: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`exp`
-
-
-
-gpu::log
------------
-Computes a natural logarithm of absolute value of each matrix element.
-
-.. ocv:function:: void gpu::log( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
-
-    :param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
-
-    :param b: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`log`
-
-
-
-gpu::pow
------------
-Raises every matrix element to a power.
-
-.. ocv:function:: void gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src: Source matrix. Supports all depths except ``CV_64F`` .
-
-    :param power: Exponent of power.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-The function ``pow`` raises every element of the input matrix to the power ``p`` (the ``power`` argument):
-
-.. math::
-
-    \texttt{dst} (I) =  \fork{\texttt{src}(I)^p}{if \texttt{p} is integer}{|\texttt{src}(I)|^p}{otherwise}
-
-.. seealso:: :ocv:func:`pow`
-
-
-
-gpu::absdiff
----------------
-Computes per-element absolute difference of two matrices (or of a matrix and scalar).
-
-.. ocv:function:: void gpu::absdiff( const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::absdiff( const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream=Stream::Null() )
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix whose absolute difference with ``a`` is computed.
-
-    :param s: A scalar whose absolute difference with each element of ``a`` is computed.
-
-    :param c: Destination matrix with the same size and type as ``a`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`absdiff`
-
-
-
-gpu::compare
----------------
-Compares elements of two matrices.
-
-.. ocv:function:: void gpu::compare( const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream=Stream::Null() )
-
-.. ocv:function:: void gpu::compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null())
-
-    :param a: First source matrix.
-
-    :param b: Second source matrix with the same size and type as ``a`` .
-
-    :param sc: A scalar to be compared with ``a`` .
-
-    :param c: Destination matrix with the same size as ``a`` and the ``CV_8UC1`` type.
-
-    :param cmpop: Flag specifying the relation between the elements to be checked:
-
-            * **CMP_EQ:** ``a(.) == b(.)``
-            * **CMP_GT:** ``a(.) > b(.)``
-            * **CMP_GE:** ``a(.) >= b(.)``
-            * **CMP_LT:** ``a(.) < b(.)``
-            * **CMP_LE:** ``a(.) <= b(.)``
-            * **CMP_NE:** ``a(.) != b(.)``
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`compare`
-
-
-
-gpu::bitwise_not
--------------------
-Performs a per-element bitwise inversion.
-
-.. ocv:function:: void gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-
-    :param src: Source matrix.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_or
-------------------
-Performs a per-element bitwise disjunction of two matrices or of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_and
--------------------
-Performs a per-element bitwise conjunction of two matrices or of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::bitwise_xor
--------------------
-Performs a per-element bitwise ``exclusive or`` operation of two matrices or of matrix and scalar.
-
-.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
-.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix with the same size and type as ``src1`` .
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param mask: Optional operation mask. 8-bit single channel image.
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::rshift
--------------------
-Performs pixel by pixel right shift of an image by a constant value.
-
-.. ocv:function:: void gpu::rshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src: Source matrix. Supports 1, 3 and 4 channels images with integers elements.
-
-    :param sc: Constant values, one per channel.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::lshift
--------------------
-Performs pixel by pixel left shift of an image by a constant value.
-
-.. ocv:function:: void gpu::lshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
-
-    :param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
-
-    :param sc: Constant values, one per channel.
-
-    :param dst: Destination matrix with the same size and type as ``src`` .
-
-    :param stream: Stream for the asynchronous version.
-
-
-
-gpu::min
------------
-Computes the per-element minimum of two matrices (or a matrix and a scalar).
-
-.. ocv:function:: void gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix or a scalar to compare ``src1`` elements with.
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`min`
-
-
-
-gpu::max
------------
-Computes the per-element maximum of two matrices (or a matrix and a scalar).
-
-.. ocv:function:: void gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-.. ocv:function:: void gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
-
-    :param src1: First source matrix.
-
-    :param src2: Second source matrix or a scalar to compare ``src1`` elements with.
-
-    :param dst: Destination matrix with the same size and type as ``src1`` .
-
-    :param stream: Stream for the asynchronous version.
-
-.. seealso:: :ocv:func:`max`
--- a/modules/gpu/include/opencv2/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu.hpp
@@ -50,6 +50,7 @@
 #endif

 #include "opencv2/core/gpumat.hpp"
+#include "opencv2/gpuarithm.hpp"
 #include "opencv2/imgproc.hpp"
 #include "opencv2/objdetect.hpp"
 #include "opencv2/features2d.hpp"
@@ -269,182 +270,8 @@ CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat&
 //! supports only ksize = 1 and ksize = 3
 CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());

+////////////////////////////// Image processing //////////////////////////////

-////////////////////////////// Arithmetics ///////////////////////////////////
-
-//! implements generalized matrix product algorithm GEMM from BLAS
-CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
-    const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
-
-//! transposes the matrix
-//! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
-CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! reverses the order of the rows, columns or both in a matrix
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
-CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
-
-//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
-//! destination array will have the same depth as lut and the same number of channels as source
-//! supports CV_8UC1, CV_8UC3 types
-CV_EXPORTS void LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! makes multi-channel array out of several single-channel arrays
-CV_EXPORTS void merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null());
-
-//! copies each plane of a multi-channel array to a dedicated array
-CV_EXPORTS void split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream = Stream::Null());
-
-//! computes magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitude(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of complex (x(i).re, x(i).im) vector
-//! supports only CV_32FC2 type
-CV_EXPORTS void magnitudeSqr(const GpuMat& xy, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes squared magnitude of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null());
-
-//! computes angle (angle(i)) of each (x(i), y(i)) vector
-//! supports only floating-point source
-CV_EXPORTS void phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts Cartesian coordinates to polar
-//! supports only floating-point source
-CV_EXPORTS void cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! converts polar coordinates to Cartesian
-//! supports only floating-point source
-CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees = false, Stream& stream = Stream::Null());
-
-//! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0,
-                          int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat());
-CV_EXPORTS void normalize(const GpuMat& src, GpuMat& dst, double a, double b,
-                          int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf);
-
-
-//////////////////////////// Per-element operations ////////////////////////////////////
-
-//! adds one matrix to another (c = a + b)
-CV_EXPORTS void add(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! adds scalar to a matrix (c = a + s)
-CV_EXPORTS void add(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-
-//! subtracts one matrix from another (c = a - b)
-CV_EXPORTS void subtract(const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-//! subtracts scalar from a matrix (c = a - s)
-CV_EXPORTS void subtract(const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask = GpuMat(), int dtype = -1, Stream& stream = Stream::Null());
-
-//! computes element-wise weighted product of the two arrays (c = scale * a * b)
-CV_EXPORTS void multiply(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes weighted product of a matrix and a scalar (c = scale * a * s)
-CV_EXPORTS void multiply(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-
-//! computes element-wise weighted quotient of the two arrays (c = a / b)
-CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes element-wise weighted quotient of matrix and scalar (c = a / s)
-CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes element-wise weighted reciprocal of an array (c = scale / b)
-CV_EXPORTS void divide(double scale, const GpuMat& b, GpuMat& c, int dtype = -1, Stream& stream = Stream::Null());
-
-//! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
-CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
-                            int dtype = -1, Stream& stream = Stream::Null());
-
-//! adds scaled array to another one (dst = alpha*src1 + src2)
-static inline void scaleAdd(const GpuMat& src1, double alpha, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
-{
-    addWeighted(src1, alpha, src2, 1.0, 0.0, dst, -1, stream);
-}
-
-//! computes element-wise absolute difference of two arrays (c = abs(a - b))
-CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream = Stream::Null());
-//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
-CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
-
-//! computes absolute value of each matrix element
-//! supports CV_16S and CV_32F depth
-CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes square of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes square root of each pixel in an image
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes exponent of each matrix element (b = e**a)
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
-
-//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
-//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
-CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
-
-//! computes power of each matrix element:
-//    (dst(i,j) = pow(     src(i,j) , power), if src.type() is integer
-//    (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
-//! supports all, except depth == CV_64F
-CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! compares elements of two arrays (c = a <cmpop> b)
-CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
-CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
-
-//! performs per-element bit-wise inversion
-CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise disjunction of two arrays
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise disjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise conjunction of two arrays
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise conjunction of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! calculates per-element bit-wise "exclusive or" operation
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
-//! calculates per-element bit-wise "exclusive or" of array and scalar
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel right shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with integer elements
-CV_EXPORTS void rshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! pixel by pixel left shift of an image by a constant value
-//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
-CV_EXPORTS void lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of two arrays (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element minimum of array and scalar (dst = min(src1, src2))
-CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of two arrays (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
-
-//! computes per-element maximum of array and scalar (dst = max(src1, src2))
-CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());

 enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
       ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
@@ -453,9 +280,6 @@ enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA
 //! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
 CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null());

-
-////////////////////////////// Image processing //////////////////////////////
-
 //! DST[x,y] = SRC[xmap[x,y],ymap[x,y]]
 //! supports only CV_32FC1 map type
 CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
@@ -521,9 +345,6 @@ CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& strea
 //! Routines for correcting image color gamma
 CV_EXPORTS void gammaCorrection(const GpuMat& src, GpuMat& dst, bool forward = true, Stream& stream = Stream::Null());

-//! applies fixed threshold to the image
-CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
-
 //! resizes the image
 //! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA
 CV_EXPORTS void resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
@@ -794,62 +615,6 @@ private:
    CannyBuf cannyBuf_;
 };

-////////////////////////////// Matrix reductions //////////////////////////////
-
-//! computes mean value and standard deviation of all or selected array elements
-//! supports only CV_8UC1 type
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
-//! buffered version
-CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
-
-//! computes norm of array
-//! supports NORM_INF, NORM_L1, NORM_L2
-//! supports all matrices except 64F
-CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, GpuMat& buf);
-CV_EXPORTS double norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf);
-
-//! computes norm of the difference between two arrays
-//! supports NORM_INF, NORM_L1, NORM_L2
-//! supports only CV_8UC1 type
-CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);
-
-//! computes sum of array elements
-//! supports only single channel images
-CV_EXPORTS Scalar sum(const GpuMat& src);
-CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
-
-//! computes sum of array elements absolute values
-//! supports only single channel images
-CV_EXPORTS Scalar absSum(const GpuMat& src);
-CV_EXPORTS Scalar absSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
-
-//! computes squared sum of array elements
-//! supports only single channel images
-CV_EXPORTS Scalar sqrSum(const GpuMat& src);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);
-CV_EXPORTS Scalar sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf);
-
-//! finds global minimum and maximum array elements and returns their values
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);
-
-//! finds global minimum and maximum array elements and returns their values with locations
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0,
-                          const GpuMat& mask=GpuMat());
-CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                          const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);
-
-//! counts non-zero array elements
-CV_EXPORTS int countNonZero(const GpuMat& src);
-CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);
-
-//! reduces a matrix to a vector
-CV_EXPORTS void reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null());
-
-
 ///////////////////////////// Calibration 3D //////////////////////////////////

 CV_EXPORTS void transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec,
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -1,565 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-////////////////////////////////////////////////////////////////////////
-// gemm
-
-void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
-{
-#ifndef HAVE_CUBLAS
-    (void)src1;
-    (void)src2;
-    (void)alpha;
-    (void)src3;
-    (void)beta;
-    (void)dst;
-    (void)flags;
-    (void)stream;
-    CV_Error(cv::Error::StsNotImplemented, "The library was build without CUBLAS");
-#else
-    // CUBLAS works with column-major matrices
-
-    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
-    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
-
-    if (src1.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    bool tr1 = (flags & GEMM_1_T) != 0;
-    bool tr2 = (flags & GEMM_2_T) != 0;
-    bool tr3 = (flags & GEMM_3_T) != 0;
-
-    if (src1.type() == CV_64FC2)
-    {
-        if (tr1 || tr2 || tr3)
-            CV_Error(cv::Error::StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
-    }
-
-    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
-    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
-    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
-    Size dstSize(src2Size.width, src1Size.height);
-
-    CV_Assert(src1Size.width == src2Size.height);
-    CV_Assert(src3.empty() || src3Size == dstSize);
-
-    dst.create(dstSize, src1.type());
-
-    if (beta != 0)
-    {
-        if (src3.empty())
-        {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
-        }
-        else
-        {
-            if (tr3)
-            {
-                transpose(src3, dst, stream);
-            }
-            else
-            {
-                if (stream)
-                    stream.enqueueCopy(src3, dst);
-                else
-                    src3.copyTo(dst);
-            }
-        }
-    }
-
-    cublasHandle_t handle;
-    cublasSafeCall( cublasCreate_v2(&handle) );
-
-    cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );
-
-    cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );
-
-    const float alphaf = static_cast<float>(alpha);
-    const float betaf = static_cast<float>(beta);
-
-    const cuComplex alphacf = make_cuComplex(alphaf, 0);
-    const cuComplex betacf = make_cuComplex(betaf, 0);
-
-    const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
-    const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);
-
-    cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
-    cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-    switch (src1.type())
-    {
-    case CV_32FC1:
-        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphaf,
-            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
-            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
-            &betaf,
-            dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
-        break;
-
-    case CV_64FC1:
-        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alpha,
-            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
-            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
-            &beta,
-            dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
-        break;
-
-    case CV_32FC2:
-        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphacf,
-            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
-            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
-            &betacf,
-            dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
-        break;
-
-    case CV_64FC2:
-        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphac,
-            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
-            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
-            &betac,
-            dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
-        break;
-    }
-
-    cublasSafeCall( cublasDestroy_v2(handle) );
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////
-// transpose
-
-void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
-{
-    CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8);
-
-    dst.create( src.cols, src.rows, src.type() );
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    if (src.elemSize() == 1)
-    {
-        NppStreamHandler h(stream);
-
-        NppiSize sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-    }
-    else if (src.elemSize() == 4)
-    {
-        NppStStreamHandler h(stream);
-
-        NcvSize32u sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
-    }
-    else // if (src.elemSize() == 8)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-
-        NppStStreamHandler h(stream);
-
-        NcvSize32u sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
-    }
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-////////////////////////////////////////////////////////////////////////
-// flip
-
-namespace
-{
-    template<int DEPTH> struct NppTypeTraits;
-    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
-    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
-    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
-    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
-    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
-    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
-    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
-
-    template <int DEPTH> struct NppMirrorFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
-    };
-
-    template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
-    {
-        typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize sz;
-            sz.width  = src.cols;
-            sz.height = src.rows;
-
-            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
-                dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
-                (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
-{
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
-    static const func_t funcs[6][4] =
-    {
-        {NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
-        {0,0,0,0},
-        {NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
-        {0,0,0,0},
-        {NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
-        {NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
-    };
-
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
-
-    dst.create(src.size(), src.type());
-
-    funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// LUT
-
-void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
-{
-    const int cn = src.channels();
-
-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
-    CV_Assert( lut.depth() == CV_8U );
-    CV_Assert( lut.channels() == 1 || lut.channels() == cn );
-    CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
-
-    dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
-
-    NppiSize sz;
-    sz.height = src.rows;
-    sz.width = src.cols;
-
-    Mat nppLut;
-    lut.convertTo(nppLut, CV_32S);
-
-    int nValues3[] = {256, 256, 256};
-
-    Npp32s pLevels[256];
-    for (int i = 0; i < 256; ++i)
-        pLevels[i] = i;
-
-    const Npp32s* pLevels3[3];
-
-#if (CUDA_VERSION <= 4020)
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
-#else
-    GpuMat d_pLevels;
-    d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
-#endif
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    NppStreamHandler h(stream);
-
-    if (src.type() == CV_8UC1)
-    {
-#if (CUDA_VERSION <= 4020)
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
-#else
-        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
-#endif
-    }
-    else
-    {
-        const Npp32s* pValues3[3];
-
-        Mat nppLut3[3];
-        if (nppLut.channels() == 1)
-        {
-#if (CUDA_VERSION <= 4020)
-            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
-#else
-            GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
-#endif
-        }
-        else
-        {
-            cv::split(nppLut, nppLut3);
-
-#if (CUDA_VERSION <= 4020)
-            pValues3[0] = nppLut3[0].ptr<Npp32s>();
-            pValues3[1] = nppLut3[1].ptr<Npp32s>();
-            pValues3[2] = nppLut3[2].ptr<Npp32s>();
-#else
-            GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
-            GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
-            GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
-
-            pValues3[0] = d_nppLut0.ptr<Npp32s>();
-            pValues3[1] = d_nppLut1.ptr<Npp32s>();
-            pValues3[2] = d_nppLut2.ptr<Npp32s>();
-#endif
-        }
-
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
-    }
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-////////////////////////////////////////////////////////////////////////
-// NPP magnitude
-
-namespace
-{
-    // Common signature of the NPP primitives that map a complex 32f image to a real 32f image.
-    typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
-
-    // Applies the NPP primitive `func` to a CV_32FC2 matrix and writes a CV_32FC1 result.
-    //   src    - input matrix, must be CV_32FC2 (asserted)
-    //   dst    - output matrix, (re)created as CV_32FC1 with the size of src
-    //   func   - NPP function performing the per-element computation
-    //   stream - CUDA stream; when it is the default stream (0) the device is synchronized before returning
-    inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
-    {
-        CV_Assert(src.type() == CV_32FC2);
-
-        dst.create(src.size(), CV_32FC1);
-
-        // NOTE(review): NppStreamHandler presumably binds NPP to `stream` for this scope - confirm against its definition.
-        NppStreamHandler streamGuard(stream);
-
-        NppiSize roi;
-        roi.width = src.cols;
-        roi.height = src.rows;
-
-        const int srcStep = static_cast<int>(src.step);
-        const int dstStep = static_cast<int>(dst.step);
-
-        nppSafeCall( func(src.ptr<Npp32fc>(), srcStep, dst.ptr<Npp32f>(), dstStep, roi) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    // Interleaved complex input (CV_32FC2): delegate to the NPP magnitude primitive.
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-    npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, cudaStream);
-}
-
-void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    // Interleaved complex input (CV_32FC2): delegate to the NPP squared-magnitude primitive.
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-    npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, cudaStream);
-}
-
-////////////////////////////////////////////////////////////////////////
-// Polar <-> Cart
-
-// Forward declarations of the Cartesian <-> polar launchers; only declared here, the definitions live elsewhere.
-namespace cv { namespace gpu { namespace cudev { namespace mathfunc
-{
-    // Computes magnitude (or squared magnitude when magSqr is true) and/or angle from x/y planes.
-    void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
-
-    // Computes x/y planes from magnitude and angle planes.
-    void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
-}}}}
-
-namespace
-{
-    // Validates the inputs, allocates whichever outputs were requested and forwards to the device launcher.
-    //   x, y  - input planes; must share size and type, depth must be CV_32F (asserted)
-    //   mag   - optional output (null pointer = not requested); created with the size/type of x
-    //   angle - optional output (null pointer = not requested); created with the size/type of x
-    // All matrices are passed to the launcher as single-channel views.
-    inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
-    {
-        CV_Assert(x.size() == y.size() && x.type() == y.type());
-        CV_Assert(x.depth() == CV_32F);
-
-        // An output that was not requested stays an empty GpuMat, which the launcher sees as a null data pointer.
-        GpuMat mag1cn;
-        GpuMat angle1cn;
-
-        if (mag)
-            mag->create(x.size(), x.type());
-        if (angle)
-            angle->create(x.size(), x.type());
-
-        const GpuMat x1cn = x.reshape(1);
-        const GpuMat y1cn = y.reshape(1);
-
-        if (mag)
-            mag1cn = mag->reshape(1);
-        if (angle)
-            angle1cn = angle->reshape(1);
-
-        ::cv::gpu::cudev::mathfunc::cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
-    }
-
-    // Validates the inputs, allocates the outputs and forwards to the device launcher.
-    //   mag   - magnitude plane; may be empty, otherwise it must match angle in size and type
-    //   angle - angle plane, depth must be CV_32F (asserted)
-    //   x, y  - outputs, created with the size and type of angle
-    // All matrices are passed to the launcher as single-channel views.
-    inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
-    {
-        using namespace ::cv::gpu::cudev::mathfunc;
-
-        // The type/size requirements on mag only apply when a magnitude is actually supplied;
-        // an empty mag has no meaningful type, so everything is derived from angle instead.
-        CV_Assert(angle.depth() == CV_32F);
-        CV_Assert(mag.empty() || (mag.size() == angle.size() && mag.type() == angle.type()));
-
-        x.create(angle.size(), angle.type());
-        y.create(angle.size(), angle.type());
-
-        // Keep an empty magnitude empty so the launcher receives a null data pointer for it.
-        GpuMat mag1cn = mag.empty() ? GpuMat() : mag.reshape(1);
-        GpuMat angle1cn = angle.reshape(1);
-        GpuMat x1cn = x.reshape(1);
-        GpuMat y1cn = y.reshape(1);
-
-        polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
-    }
-}
-
-void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
-{
-    // Magnitude only: plain (non-squared) magnitude, no angle output requested.
-    GpuMat* const noAngle = 0;
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-    cartToPolar_caller(x, y, &dst, false, noAngle, false, cudaStream);
-}
-
-void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
-{
-    // Squared magnitude only: no angle output requested.
-    GpuMat* const noAngle = 0;
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-    cartToPolar_caller(x, y, &dst, true, noAngle, false, cudaStream);
-}
-
-void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
-{
-    // Angle only: no magnitude output requested.
-    GpuMat* const noMagnitude = 0;
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-    cartToPolar_caller(x, y, noMagnitude, false, &angle, angleInDegrees, cudaStream);
-}
-
-void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
-{
-    // Both outputs requested; the magnitude is the plain (non-squared) one.
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-    cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, cudaStream);
-}
-
-void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
-{
-    // Public entry point: resolve the CUDA stream and hand over to the validating helper.
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-    polarToCart_caller(magnitude, angle, x, y, angleInDegrees, cudaStream);
-}
-
-////////////////////////////////////////////////////////////////////////
-// normalize
-
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
-{
-    // Convenience overload: supplies throw-away scratch buffers to the buffered overload.
-    GpuMat norm_buf, cvt_buf;
-    normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
-}
-
-// Normalizes src into dst as dst = src * scale + shift, converting to dtype.
-//   NORM_MINMAX              - maps [min(src), max(src)] onto [min(a, b), max(a, b)]
-//   NORM_L1/NORM_L2/NORM_INF - scales so that the selected norm of src becomes a
-//   any other norm_type      - raises StsBadArg
-// A source range / norm not greater than double epsilon yields scale = 0.
-// norm_buf is scratch for the reduction; cvt_buf holds the converted image when a mask is used.
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-{
-    const double eps = std::numeric_limits<double>::epsilon();
-
-    double scale = 1, shift = 0;
-
-    switch (norm_type)
-    {
-    case NORM_MINMAX:
-        {
-            double smin = 0, smax = 0;
-            const double dmin = std::min(a, b);
-            const double dmax = std::max(a, b);
-            minMax(src, &smin, &smax, mask, norm_buf);
-            scale = (dmax - dmin) * (smax - smin > eps ? 1.0 / (smax - smin) : 0.0);
-            shift = dmin - smin * scale;
-        }
-        break;
-
-    case NORM_L2:
-    case NORM_L1:
-    case NORM_INF:
-        {
-            const double srcNorm = norm(src, norm_type, mask, norm_buf);
-            scale = srcNorm > eps ? a / srcNorm : 0.0;
-            shift = 0;
-        }
-        break;
-
-    default:
-        CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
-    }
-
-    if (!mask.empty())
-    {
-        // Convert everything into the scratch buffer, then copy only the masked elements into dst.
-        src.convertTo(cvt_buf, dtype, scale, shift);
-        cvt_buf.copyTo(dst, mask);
-    }
-    else
-    {
-        src.convertTo(dst, dtype, scale, shift);
-    }
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1,217 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace mathfunc
-    {
-        //////////////////////////////////////////////////////////////////////////////////////
-        // Cart <-> Polar
-
-        struct Nothing // no-op policy: same calc() signature as the other policies, but writes nothing
-        {
-            static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
-            {
-            } // intentionally empty
-        };
-        // Policy writing sqrt(x^2 + y^2) to dst at (x, y); dst_step is in elements, the last argument is unused.
-        struct Magnitude
-        {
-            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
-            {
-                float* const out = dst + (y * dst_step + x);
-                *out = ::sqrtf(x_data * x_data + y_data * y_data);
-            }
-        };
-        // Policy writing x^2 + y^2 to dst at (x, y); dst_step is in elements, the last argument is unused.
-        struct MagnitudeSqr
-        {
-            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
-            {
-                float* const out = dst + (y * dst_step + x);
-                *out = x_data * x_data + y_data * y_data;
-            }
-        };
-        // Policy writing scale * atan2(y, x) to dst at (x, y), with negative angles shifted up by 2*pi first.
-        struct Atan2
-        {
-            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
-            {
-                float angle = ::atan2f(y_data, x_data);
-
-                // Branch-free wrap: the comparison yields 0 or 1, so 2*pi is added only for negative angles.
-                const float wrap = (angle < 0) * 2.0f * CV_PI_F;
-                angle += wrap;
-
-                dst[y * dst_step + x] = scale * angle;
-            }
-        };
-        // One thread per element: loads x/y once and lets the Mag and Angle policies write their outputs.
-        // All *_step arguments are strides in elements (floats), not bytes.
-        template <typename Mag, typename Angle>
-        __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
-                                    float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
-        {
-            const int col = blockDim.x * blockIdx.x + threadIdx.x;
-            const int row = blockDim.y * blockIdx.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= width || row >= height)
-                return;
-
-            const float x_val = xptr[row * x_step + col];
-            const float y_val = yptr[row * y_step + col];
-
-            Mag::calc(col, row, x_val, y_val, mag, mag_step, scale);
-            Angle::calc(col, row, x_val, y_val, angle, angle_step, scale);
-        }
-
-        // Magnitude policy reading the stored value at (x, y); mag_step is in elements.
-        struct NonEmptyMag
-        {
-            static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
-            {
-                const float* const elem = mag + (y * mag_step + x);
-                return *elem;
-            }
-        };
-        struct EmptyMag // magnitude policy that ignores its arguments and never touches the mag pointer
-        {
-            static __device__ __forceinline__ float get(const float*, size_t, int, int)
-            {
-                return 1.0f; // constant unit magnitude
-            }
-        };
-        // One thread per element: x = m * cos(scale * a), y = m * sin(scale * a), where m comes from the Mag policy.
-        // All *_step arguments are strides in elements (floats), not bytes.
-        template <typename Mag>
-        __global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
-            float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
-        {
-            const int col = blockDim.x * blockIdx.x + threadIdx.x;
-            const int row = blockDim.y * blockIdx.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= width || row >= height)
-                return;
-
-            const float magnitude = Mag::get(mag, mag_step, col, row);
-            const float theta = angle[row * angle_step + col];
-
-            float sin_a, cos_a;
-            ::sincosf(scale * theta, &sin_a, &cos_a);
-
-            xptr[row * x_step + col] = magnitude * cos_a;
-            yptr[row * y_step + col] = magnitude * sin_a;
-        }
-
-        // Launches cartToPolar<Mag, Angle> over the extent of x using 32x8 thread blocks.
-        // Byte steps are converted to element steps; angles are scaled to degrees on request.
-        // On the default stream (0) the device is synchronized before returning.
-        template <typename Mag, typename Angle>
-        void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8, 1);
-            const dim3 grid(divUp(x.cols, threads.x), divUp(x.rows, threads.y), 1);
-
-            float scale = 1.f;
-            if (angleInDegrees)
-                scale = 180.0f / CV_PI_F;
-
-            const size_t x_step = x.step/x.elemSize();
-            const size_t y_step = y.step/y.elemSize();
-            const size_t mag_step = mag.step/mag.elemSize();
-            const size_t angle_step = angle.step/angle.elemSize();
-
-            cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
-                x.data, x_step, y.data, y_step,
-                mag.data, mag_step, angle.data, angle_step, scale, x.cols, x.rows);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        // Selects the kernel instantiation matching the requested outputs and runs it.
-        // A null mag.data / angle.data means that output is not wanted and maps to the Nothing policy.
-        void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
-
-            // Indexed as [magnitude absent][squared magnitude][angle absent].
-            static const caller_t callers[2][2][2] =
-            {
-                {
-                    { cartToPolar_caller<Magnitude, Atan2>,    cartToPolar_caller<Magnitude, Nothing>    },
-                    { cartToPolar_caller<MagnitudeSqr, Atan2>, cartToPolar_caller<MagnitudeSqr, Nothing> }
-                },
-                {
-                    { cartToPolar_caller<Nothing, Atan2>,      cartToPolar_caller<Nothing, Nothing>      },
-                    { cartToPolar_caller<Nothing, Atan2>,      cartToPolar_caller<Nothing, Nothing>      }
-                }
-            };
-
-            const int magAbsent = (mag.data == 0) ? 1 : 0;
-            const int sqrIdx = magSqr ? 1 : 0;
-            const int angleAbsent = (angle.data == 0) ? 1 : 0;
-
-            const caller_t func = callers[magAbsent][sqrIdx][angleAbsent];
-            func(x, y, mag, angle, angleInDegrees, stream);
-        }
-
-        // Launches polarToCart<Mag> using 32x8 thread blocks.
-        // Byte steps are converted to element steps; degree input is scaled to radians on request.
-        // On the default stream (0) the device is synchronized before returning.
-        template <typename Mag>
-        void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            // The extent comes from `angle`, the only input that is always read: this function is
-            // also instantiated for EmptyMag, i.e. for a `mag` with a null data pointer whose
-            // cols/rows cannot be relied on.
-            grid.x = divUp(angle.cols, threads.x);
-            grid.y = divUp(angle.rows, threads.y);
-
-            const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
-
-            polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
-                angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), angle.cols, angle.rows);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        // Runs the polar -> Cartesian conversion, choosing the magnitude policy from whether mag has data.
-        void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
-
-            // Index 0: magnitude present; index 1: null magnitude pointer.
-            static const caller_t callers[2] =
-            {
-                polarToCart_caller<NonEmptyMag>,
-                polarToCart_caller<EmptyMag>
-            };
-
-            const int magAbsent = (mag.data == 0) ? 1 : 0;
-            const caller_t func = callers[magAbsent];
-
-            func(mag, angle, x, y, angleInDegrees, stream);
-        }
-    } // namespace mathfunc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -45,24 +45,20 @@

 #include <cuda_runtime_api.h>
 #include <cufft.h>
-#include <cublas.h>
 #include "NCV.hpp"

 #if defined(__GNUC__) // this branch additionally forwards __func__ to the checkers
    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
-    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
 #else /* defined(__CUDACC__) || defined(__MSVC__) */
    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__)
    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)
-    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__)
 #endif // end of the *SafeCall macro definitions

 namespace cv { namespace gpu
 { // error reporters used by the ___*SafeCall helpers; only declared here
    void ncvError(int err, const char *file, const int line, const char *func = "");
    void cufftError(int err, const char *file, const int line, const char *func = "");
-    void cublasError(int err, const char *file, const int line, const char *func = "");
 }}

 static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
@@ -77,10 +73,4 @@ static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const i
        cv::gpu::cufftError(err, file, line, func);
 }

-static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "") // backend of the cublasSafeCall macro
-{
-    if (CUBLAS_STATUS_SUCCESS != err) // any status other than success is forwarded to the reporter
-        cv::gpu::cublasError(err, file, line, func);
-}
-
 #endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -1,511 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace split_merge
-    {
-        template <typename T, size_t elem_size = sizeof(T)> // elem_size defaults to sizeof(T) and selects a specialization
-        struct TypeTraits // primary template: every alias is T itself
-        {
-            typedef T type;
-            typedef T type2;
-            typedef T type3;
-            typedef T type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 1> // 1-byte elements: scalar and 2/3/4-component char vector types
-        {
-            typedef char type;
-            typedef char2 type2;
-            typedef char3 type3;
-            typedef char4 type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 2> // 2-byte elements: scalar and 2/3/4-component short vector types
-        {
-            typedef short type;
-            typedef short2 type2;
-            typedef short3 type3;
-            typedef short4 type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 4> // 4-byte elements: scalar and 2/3/4-component int vector types
-        {
-            typedef int type;
-            typedef int2 type2;
-            typedef int3 type3;
-            typedef int4 type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 8> // 8-byte elements: only the scalar and 2-component types are provided
-        {
-            typedef double type;
-            typedef double2 type2;
-            //typedef double3 type3;
-            //typedef double4 type4;
-        };
-
-        typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream); // host launcher: array of source planes -> one destination
-        typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream); // host launcher: one source -> array of destination planes
-
-        //------------------------------------------------------------
-        // Merge
-
-        // Kernel: interleaves two single-channel planes into a 2-channel image, one thread per pixel.
-        // All steps are row strides in bytes.
-        template <typename T>
-        __global__ void mergeC2_(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            typedef typename TypeTraits<T>::type2 dst_type;
-
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const T* const src0_row = (const T*)(src0 + row * src0_step);
-            const T* const src1_row = (const T*)(src1 + row * src1_step);
-            dst_type* const dst_row = (dst_type*)(dst + row * dst_step);
-
-            dst_type packed;
-            packed.x = src0_row[col];
-            packed.y = src1_row[col];
-            dst_row[col] = packed;
-        }
-
-
-        // Kernel: interleaves three single-channel planes into a 3-channel image, one thread per pixel.
-        // All steps are row strides in bytes.
-        template <typename T>
-        __global__ void mergeC3_(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            typedef typename TypeTraits<T>::type3 dst_type;
-
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const T* const src0_row = (const T*)(src0 + row * src0_step);
-            const T* const src1_row = (const T*)(src1 + row * src1_step);
-            const T* const src2_row = (const T*)(src2 + row * src2_step);
-            dst_type* const dst_row = (dst_type*)(dst + row * dst_step);
-
-            dst_type packed;
-            packed.x = src0_row[col];
-            packed.y = src1_row[col];
-            packed.z = src2_row[col];
-            dst_row[col] = packed;
-        }
-
-
-        // Kernel specialization for double: writes the three channel values of a pixel
-        // as consecutive scalars instead of going through a 3-component vector type.
-        template <>
-        __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const double* const src0_row = (const double*)(src0 + row * src0_step);
-            const double* const src1_row = (const double*)(src1 + row * src1_step);
-            const double* const src2_row = (const double*)(src2 + row * src2_step);
-            double* const dst_row = (double*)(dst + row * dst_step);
-
-            double* const pixel = dst_row + 3 * col;
-            pixel[0] = src0_row[col];
-            pixel[1] = src1_row[col];
-            pixel[2] = src2_row[col];
-        }
-
-
-        // Kernel: interleaves four single-channel planes into a 4-channel image, one thread per pixel.
-        // All steps are row strides in bytes.
-        template <typename T>
-        __global__ void mergeC4_(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 const uchar* src3, size_t src3_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            typedef typename TypeTraits<T>::type4 dst_type;
-
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const T* const src0_row = (const T*)(src0 + row * src0_step);
-            const T* const src1_row = (const T*)(src1 + row * src1_step);
-            const T* const src2_row = (const T*)(src2 + row * src2_step);
-            const T* const src3_row = (const T*)(src3 + row * src3_step);
-            dst_type* const dst_row = (dst_type*)(dst + row * dst_step);
-
-            dst_type packed;
-            packed.x = src0_row[col];
-            packed.y = src1_row[col];
-            packed.z = src2_row[col];
-            packed.w = src3_row[col];
-            dst_row[col] = packed;
-        }
-
-
-        // Kernel specialization for double: each 4-channel pixel is written as two double2 halves.
-        template <>
-        __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 const uchar* src3, size_t src3_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const double* const src0_row = (const double*)(src0 + row * src0_step);
-            const double* const src1_row = (const double*)(src1 + row * src1_step);
-            const double* const src2_row = (const double*)(src2 + row * src2_step);
-            const double* const src3_row = (const double*)(src3 + row * src3_step);
-            double2* const dst_row = (double2*)(dst + row * dst_step);
-
-            // Channels 0-1 go into the first half, channels 2-3 into the second.
-            double2* const pixel = dst_row + 2 * col;
-            pixel[0] = make_double2(src0_row[col], src1_row[col]);
-            pixel[1] = make_double2(src2_row[col], src3_row[col]);
-        }
-
-
-        // Host launcher for the 2-plane merge kernel: 32x8 blocks covering dst.
-        // On the default stream (0) the device is synchronized before returning.
-        template <typename T>
-        static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
-        {
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            const PtrStepSzb& plane0 = src[0];
-            const PtrStepSzb& plane1 = src[1];
-
-            mergeC2_<T><<<grid, block, 0, stream>>>(
-                    plane0.data, plane0.step,
-                    plane1.data, plane1.step,
-                    dst.rows, dst.cols, dst.data, dst.step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        // Host launcher for the 3-plane merge kernel: 32x8 blocks covering dst.
-        // On the default stream (0) the device is synchronized before returning.
-        template <typename T>
-        static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
-        {
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            const PtrStepSzb& plane0 = src[0];
-            const PtrStepSzb& plane1 = src[1];
-            const PtrStepSzb& plane2 = src[2];
-
-            mergeC3_<T><<<grid, block, 0, stream>>>(
-                    plane0.data, plane0.step,
-                    plane1.data, plane1.step,
-                    plane2.data, plane2.step,
-                    dst.rows, dst.cols, dst.data, dst.step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        // Host launcher for the 4-plane merge kernel: 32x8 blocks covering dst.
-        // On the default stream (0) the device is synchronized before returning.
-        template <typename T>
-        static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
-        {
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            const PtrStepSzb& plane0 = src[0];
-            const PtrStepSzb& plane1 = src[1];
-            const PtrStepSzb& plane2 = src[2];
-            const PtrStepSzb& plane3 = src[3];
-
-            mergeC4_<T><<<grid, block, 0, stream>>>(
-                    plane0.data, plane0.step,
-                    plane1.data, plane1.step,
-                    plane2.data, plane2.step,
-                    plane3.data, plane3.step,
-                    dst.rows, dst.cols, dst.data, dst.step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        // Dispatches to the merge launcher matching the channel count and element size.
-        //   src            - array of total_channels single-channel source planes
-        //   dst            - interleaved destination
-        //   total_channels - number of planes; 2, 3 and 4 are supported
-        //   elem_size      - size of one channel element in bytes; 1, 2, 4 and 8 are supported
-        // Any other combination raises StsUnsupportedFormat.
-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
-                                     int total_channels, size_t elem_size,
-                                     const cudaStream_t& stream)
-        {
-            // One row of five slots per channel count; the slot is elem_size >> 1
-            // (1 -> 0, 2 -> 1, 4 -> 2, 8 -> 4), slot 3 has no kernel.
-            static const MergeFunction merge_func_tbl[] =
-            {
-                mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
-                mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
-                mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
-            };
-
-            // Validate before indexing: an unchecked index could run outside the table
-            // or silently select a kernel for the wrong element size.
-            const bool channels_ok = total_channels >= 2 && total_channels <= 4;
-            const bool elem_size_ok = elem_size == 1 || elem_size == 2 || elem_size == 4 || elem_size == 8;
-
-            MergeFunction merge_func = 0;
-            if (channels_ok && elem_size_ok)
-            {
-                const size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
-                merge_func = merge_func_tbl[merge_func_id];
-            }
-
-            if (merge_func == 0)
-                CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
-
-            merge_func(src, dst, stream);
-        }
-
-
-
-        //------------------------------------------------------------
-        // Split
-
-
-        // Kernel: de-interleaves a 2-channel image into two single-channel planes, one thread per pixel.
-        // All steps are row strides in bytes.
-        template <typename T>
-        __global__ void splitC2_(const uchar* src, size_t src_step,
-                                int rows, int cols,
-                                uchar* dst0, size_t dst0_step,
-                                uchar* dst1, size_t dst1_step)
-        {
-            typedef typename TypeTraits<T>::type2 src_type;
-
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const src_type* const src_row = (const src_type*)(src + row * src_step);
-            T* const dst0_row = (T*)(dst0 + row * dst0_step);
-            T* const dst1_row = (T*)(dst1 + row * dst1_step);
-
-            const src_type pixel = src_row[col];
-            dst0_row[col] = pixel.x;
-            dst1_row[col] = pixel.y;
-        }
-
-
-        // Kernel: de-interleaves a 3-channel image into three single-channel planes, one thread per pixel.
-        // All steps are row strides in bytes.
-        template <typename T>
-        __global__ void splitC3_(const uchar* src, size_t src_step,
-                                int rows, int cols,
-                                uchar* dst0, size_t dst0_step,
-                                uchar* dst1, size_t dst1_step,
-                                uchar* dst2, size_t dst2_step)
-        {
-            typedef typename TypeTraits<T>::type3 src_type;
-
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const src_type* const src_row = (const src_type*)(src + row * src_step);
-            T* const dst0_row = (T*)(dst0 + row * dst0_step);
-            T* const dst1_row = (T*)(dst1 + row * dst1_step);
-            T* const dst2_row = (T*)(dst2 + row * dst2_step);
-
-            const src_type pixel = src_row[col];
-            dst0_row[col] = pixel.x;
-            dst1_row[col] = pixel.y;
-            dst2_row[col] = pixel.z;
-        }
-
-
-        // Kernel specialization for double: reads the three channel values of a pixel
-        // as consecutive scalars instead of going through a 3-component vector type.
-        template <>
-        __global__ void splitC3_<double>(
-                const uchar* src, size_t src_step, int rows, int cols,
-                uchar* dst0, size_t dst0_step,
-                uchar* dst1, size_t dst1_step,
-                uchar* dst2, size_t dst2_step)
-        {
-            const int col = blockIdx.x * blockDim.x + threadIdx.x;
-            const int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads that fall outside the image do nothing.
-            if (col >= cols || row >= rows)
-                return;
-
-            const double* const src_row = (const double*)(src + row * src_step);
-            double* const dst0_row = (double*)(dst0 + row * dst0_step);
-            double* const dst1_row = (double*)(dst1 + row * dst1_step);
-            double* const dst2_row = (double*)(dst2 + row * dst2_step);
-
-            const double* const pixel = src_row + 3 * col;
-            dst0_row[col] = pixel[0];
-            dst1_row[col] = pixel[1];
-            dst2_row[col] = pixel[2];
-        }
-
-
-        template <typename T>
-        __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
-                                uchar* dst0, size_t dst0_step,
-                                uchar* dst1, size_t dst1_step,
-                                uchar* dst2, size_t dst2_step,
-                                uchar* dst3, size_t dst3_step)
-        {
-            typedef typename TypeTraits<T>::type4 src_type;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const src_type* src_y = (const src_type*)(src + y * src_step);
-            T* dst0_y = (T*)(dst0 + y * dst0_step);
-            T* dst1_y = (T*)(dst1 + y * dst1_step);
-            T* dst2_y = (T*)(dst2 + y * dst2_step);
-            T* dst3_y = (T*)(dst3 + y * dst3_step);
-
-            if (x < cols && y < rows)
-            {
-                src_type src_elem = src_y[x];
-                dst0_y[x] = src_elem.x;
-                dst1_y[x] = src_elem.y;
-                dst2_y[x] = src_elem.z;
-                dst3_y[x] = src_elem.w;
-            }
-        }
-
-
-        template <>
-        __global__ void splitC4_<double>(
-                const uchar* src, size_t src_step, int rows, int cols,
-                uchar* dst0, size_t dst0_step,
-                uchar* dst1, size_t dst1_step,
-                uchar* dst2, size_t dst2_step,
-                uchar* dst3, size_t dst3_step)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const double2* src_y = (const double2*)(src + y * src_step);
-            double* dst0_y = (double*)(dst0 + y * dst0_step);
-            double* dst1_y = (double*)(dst1 + y * dst1_step);
-            double* dst2_y = (double*)(dst2 + y * dst2_step);
-            double* dst3_y = (double*)(dst3 + y * dst3_step);
-
-            if (x < cols && y < rows)
-            {
-                double2 src_elem1 = src_y[2 * x];
-                double2 src_elem2 = src_y[2 * x + 1];
-                dst0_y[x] = src_elem1.x;
-                dst1_y[x] = src_elem1.y;
-                dst2_y[x] = src_elem2.x;
-                dst3_y[x] = src_elem2.y;
-            }
-        }
-
-        template <typename T>
-        static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC2_<T><<<grid, block, 0, stream>>>(
-                    src.data, src.step, src.rows, src.cols,
-                    dst[0].data, dst[0].step,
-                    dst[1].data, dst[1].step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        template <typename T>
-        static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC3_<T><<<grid, block, 0, stream>>>(
-                    src.data, src.step, src.rows, src.cols,
-                    dst[0].data, dst[0].step,
-                    dst[1].data, dst[1].step,
-                    dst[2].data, dst[2].step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        template <typename T>
-        static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC4_<T><<<grid, block, 0, stream>>>(
-                     src.data, src.step, src.rows, src.cols,
-                     dst[0].data, dst[0].step,
-                     dst[1].data, dst[1].step,
-                     dst[2].data, dst[2].step,
-                     dst[3].data, dst[3].step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
-        {
-            static SplitFunction split_func_tbl[] =
-            {
-                splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
-                splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
-                splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
-            };
-
-            size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
-            SplitFunction split_func = split_func_tbl[split_func_id];
-
-            if (split_func == 0)
-                CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
-
-            split_func(src, dst, stream);
-        }
-    } // namespace split_merge
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -142,23 +142,6 @@ namespace
    };

    const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
-
-    //////////////////////////////////////////////////////////////////////////
-    // CUBLAS errors
-
-    const ErrorEntry cublas_errors[] =
-    {
-        error_entry( CUBLAS_STATUS_SUCCESS ),
-        error_entry( CUBLAS_STATUS_NOT_INITIALIZED ),
-        error_entry( CUBLAS_STATUS_ALLOC_FAILED ),
-        error_entry( CUBLAS_STATUS_INVALID_VALUE ),
-        error_entry( CUBLAS_STATUS_ARCH_MISMATCH ),
-        error_entry( CUBLAS_STATUS_MAPPING_ERROR ),
-        error_entry( CUBLAS_STATUS_EXECUTION_FAILED ),
-        error_entry( CUBLAS_STATUS_INTERNAL_ERROR )
-    };
-
-    const int cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]);
 }

 namespace cv
@@ -176,12 +159,6 @@ namespace cv
            String msg = getErrorString(code, cufft_errors, cufft_error_num);
            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
        }
-
-        void cublasError(int code, const char* file, const int line, const char* func)
-        {
-            String msg = getErrorString(code, cublas_errors, cublas_error_num);
-            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
-        }
    }
 }

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -92,6 +92,7 @@ void cv::gpu::Canny(const GpuMat&, const GpuMat&, CannyBuf&, GpuMat&, double, do
 void cv::gpu::CannyBuf::create(const Size&, int) { throw_no_cuda(); }
 void cv::gpu::CannyBuf::release() { throw_no_cuda(); }
 cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::gpu::CLAHE>(); }
+void cv::gpu::alphaComp(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); } // stub for the non-CUDA branch (see the #else below)

 #else /* !defined (HAVE_CUDA) */

@@ -1672,4 +1673,77 @@ cv::Ptr<cv::gpu::CLAHE> cv::gpu::createCLAHE(double clipLimit, cv::Size tileGrid
    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
 }

+////////////////////////////////////////////////////////////////////////
+// alphaComp
+
+namespace
+{
+    template <int DEPTH> struct NppAlphaCompFunc
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+        // Signature of the NPP function that NppAlphaComp takes as its template argument.
+        typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, NppiAlphaOp eAlphaOp);
+    };
+    // Adapter that forwards GpuMat arguments to one NPP function `func` for element depth DEPTH.
+    template <int DEPTH, typename NppAlphaCompFunc<DEPTH>::func_t func> struct NppAlphaComp
+    {
+        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
+        // Calls func on img1/img2 and writes into dst; dst is not allocated here and must already exist.
+        static void call(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream)
+        {
+            NppStreamHandler h(stream);
+            // Take the whole ROI from img1 (width and height from the same image).
+            NppiSize oSizeROI;
+            oSizeROI.width = img1.cols;
+            oSizeROI.height = img1.rows;
+
+            nppSafeCall( func(img1.ptr<npp_t>(), static_cast<int>(img1.step), img2.ptr<npp_t>(), static_cast<int>(img2.step),
+                              dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, eAlphaOp) );
+            // Stream 0: wait for the device before returning.
+            if (stream == 0)
+                cudaSafeCall( cudaDeviceSynchronize() );
+        }
+    };
+}
+
+// Runs the NPP AlphaComp function matching the inputs' depth on img1 and img2 and writes the result to dst.
+// alpha_op is an index into npp_alpha_ops; inputs must be 4-channel 8U/16U/32S/32F of equal size and type.
+void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream)
+{
+    static const NppiAlphaOp npp_alpha_ops[] = {
+        NPPI_OP_ALPHA_OVER,
+        NPPI_OP_ALPHA_IN,
+        NPPI_OP_ALPHA_OUT,
+        NPPI_OP_ALPHA_ATOP,
+        NPPI_OP_ALPHA_XOR,
+        NPPI_OP_ALPHA_PLUS,
+        NPPI_OP_ALPHA_OVER_PREMUL,
+        NPPI_OP_ALPHA_IN_PREMUL,
+        NPPI_OP_ALPHA_OUT_PREMUL,
+        NPPI_OP_ALPHA_ATOP_PREMUL,
+        NPPI_OP_ALPHA_XOR_PREMUL,
+        NPPI_OP_ALPHA_PLUS_PREMUL,
+        NPPI_OP_ALPHA_PREMUL
+    };
+
+    typedef void (*func_t)(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, NppiAlphaOp eAlphaOp, cudaStream_t stream);
+    // Indexed by img1.depth(); the 0 slots are depths excluded by the type assert below.
+    static const func_t funcs[] =
+    {
+        NppAlphaComp<CV_8U, nppiAlphaComp_8u_AC4R>::call,
+        0,
+        NppAlphaComp<CV_16U, nppiAlphaComp_16u_AC4R>::call,
+        0,
+        NppAlphaComp<CV_32S, nppiAlphaComp_32s_AC4R>::call,
+        NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
+    };
+
+    CV_Assert( img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4 );
+    CV_Assert( img1.size() == img2.size() && img1.type() == img2.type() );
+    CV_Assert( alpha_op >= 0 && alpha_op < static_cast<int>(sizeof(npp_alpha_ops) / sizeof(npp_alpha_ops[0])) ); // keep the table lookup in bounds
+    dst.create(img1.size(), img1.type());
+    const func_t func = funcs[img1.depth()];
+    func(img1, img2, dst, npp_alpha_ops[alpha_op], StreamAccessor::getStream(stream));
+}
+
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/matrix_reductions.cpp
+++ b/modules/gpu/src/matrix_reductions.cpp
@@ -1,700 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_no_cuda(); }
-void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_no_cuda(); }
-double cv::gpu::norm(const GpuMat&, int) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, int, const GpuMat&, GpuMat&) { throw_no_cuda(); return 0.0; }
-double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_no_cuda(); return 0.0; }
-Scalar cv::gpu::sum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::absSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-Scalar cv::gpu::sqrSum(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); return Scalar(); }
-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
-int cv::gpu::countNonZero(const GpuMat&) { throw_no_cuda(); return 0; }
-int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_no_cuda(); return 0; }
-void cv::gpu::reduce(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_no_cuda(); }
-
-#else
-#include "opencv2/core/utility.hpp"
-
-namespace
-{
-    class DeviceBuffer
-    {
-    public:
-        explicit DeviceBuffer(int count_ = 1) : count(count_)
-        {
-            cudaSafeCall( cudaMalloc(&pdev, count * sizeof(double)) );
-        }
-        ~DeviceBuffer()
-        {
-            cudaSafeCall( cudaFree(pdev) );
-        }
-
-        operator double*() {return pdev;}
-
-        void download(double* hptr)
-        {
-            double hbuf;
-            cudaSafeCall( cudaMemcpy(&hbuf, pdev, sizeof(double), cudaMemcpyDeviceToHost) );
-            *hptr = hbuf;
-        }
-        void download(double** hptrs)
-        {
-            AutoBuffer<double, 2 * sizeof(double)> hbuf(count);
-            cudaSafeCall( cudaMemcpy((void*)hbuf, pdev, count * sizeof(double), cudaMemcpyDeviceToHost) );
-            for (int i = 0; i < count; ++i)
-                *hptrs[i] = hbuf[i];
-        }
-
-    private:
-        double* pdev;
-        int count;
-    };
-}
-
-
-////////////////////////////////////////////////////////////////////////
-// meanStdDev
-
-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
-{
-    GpuMat buf;
-    meanStdDev(src, mean, stddev, buf);
-}
-
-void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
-{
-    CV_Assert(src.type() == CV_8UC1);
-
-    if (!deviceSupports(FEATURE_SET_COMPUTE_13))
-        CV_Error(cv::Error::StsNotImplemented, "Not sufficient compute capebility");
-
-    NppiSize sz;
-    sz.width  = src.cols;
-    sz.height = src.rows;
-
-    DeviceBuffer dbuf(2);
-
-    int bufSize;
-#if (CUDA_VERSION <= 4020)
-    nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
-#else
-    nppSafeCall( nppiMeanStdDevGetBufferHostSize_8u_C1R(sz, &bufSize) );
-#endif
-
-    ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
-
-    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
-
-    cudaSafeCall( cudaDeviceSynchronize() );
-
-    double* ptrs[2] = {mean.val, stddev.val};
-    dbuf.download(ptrs);
-}
-
-////////////////////////////////////////////////////////////////////////
-// norm
-
-double cv::gpu::norm(const GpuMat& src, int normType)
-{
-    GpuMat buf;
-    return norm(src, normType, GpuMat(), buf);
-}
-
-double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
-{
-    return norm(src, normType, GpuMat(), buf);
-}
-
-double cv::gpu::norm(const GpuMat& src, int normType, const GpuMat& mask, GpuMat& buf)
-{
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size() && src.channels() == 1));
-
-    GpuMat src_single_channel = src.reshape(1);
-
-    if (normType == NORM_L1)
-        return absSum(src_single_channel, mask, buf)[0];
-
-    if (normType == NORM_L2)
-        return std::sqrt(sqrSum(src_single_channel, mask, buf)[0]);
-
-    // NORM_INF
-    double min_val, max_val;
-    minMax(src_single_channel, &min_val, &max_val, mask, buf);
-    return std::max(std::abs(min_val), std::abs(max_val));
-}
-
-double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
-{
-    CV_Assert(src1.type() == CV_8UC1);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);
-
-    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
-        NppiSize oSizeROI, Npp64f* pRetVal);
-
-    static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};
-
-    NppiSize sz;
-    sz.width  = src1.cols;
-    sz.height = src1.rows;
-
-    int funcIdx = normType >> 1;
-
-    double retVal;
-
-    DeviceBuffer dbuf;
-
-    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), static_cast<int>(src1.step), src2.ptr<Npp8u>(), static_cast<int>(src2.step), sz, dbuf) );
-
-    cudaSafeCall( cudaDeviceSynchronize() );
-
-    dbuf.download(&retVal);
-
-    return retVal;
-}
-
-////////////////////////////////////////////////////////////////////////
-// Sum
-
-namespace sum
-{
-    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows);
-
-    template <typename T, int cn>
-    void run(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runAbs(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-
-    template <typename T, int cn>
-    void runSqr(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-}
-
-Scalar cv::gpu::sum(const GpuMat& src)
-{
-    GpuMat buf;
-    return sum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
-{
-    return sum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
-        {0, ::sum::run<schar , 1>, ::sum::run<schar , 2>, ::sum::run<schar , 3>, ::sum::run<schar , 4>},
-        {0, ::sum::run<ushort, 1>, ::sum::run<ushort, 2>, ::sum::run<ushort, 3>, ::sum::run<ushort, 4>},
-        {0, ::sum::run<short , 1>, ::sum::run<short , 2>, ::sum::run<short , 3>, ::sum::run<short , 4>},
-        {0, ::sum::run<int   , 1>, ::sum::run<int   , 2>, ::sum::run<int   , 3>, ::sum::run<int   , 4>},
-        {0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
-        {0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-Scalar cv::gpu::absSum(const GpuMat& src)
-{
-    GpuMat buf;
-    return absSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
-{
-    return absSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
-        {0, ::sum::runAbs<schar , 1>, ::sum::runAbs<schar , 2>, ::sum::runAbs<schar , 3>, ::sum::runAbs<schar , 4>},
-        {0, ::sum::runAbs<ushort, 1>, ::sum::runAbs<ushort, 2>, ::sum::runAbs<ushort, 3>, ::sum::runAbs<ushort, 4>},
-        {0, ::sum::runAbs<short , 1>, ::sum::runAbs<short , 2>, ::sum::runAbs<short , 3>, ::sum::runAbs<short , 4>},
-        {0, ::sum::runAbs<int   , 1>, ::sum::runAbs<int   , 2>, ::sum::runAbs<int   , 3>, ::sum::runAbs<int   , 4>},
-        {0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
-        {0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-Scalar cv::gpu::sqrSum(const GpuMat& src)
-{
-    GpuMat buf;
-    return sqrSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
-{
-    return sqrSum(src, GpuMat(), buf);
-}
-
-Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
-{
-    typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
-    static const func_t funcs[7][5] =
-    {
-        {0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
-        {0, ::sum::runSqr<schar , 1>, ::sum::runSqr<schar , 2>, ::sum::runSqr<schar , 3>, ::sum::runSqr<schar , 4>},
-        {0, ::sum::runSqr<ushort, 1>, ::sum::runSqr<ushort, 2>, ::sum::runSqr<ushort, 3>, ::sum::runSqr<ushort, 4>},
-        {0, ::sum::runSqr<short , 1>, ::sum::runSqr<short , 2>, ::sum::runSqr<short , 3>, ::sum::runSqr<short , 4>},
-        {0, ::sum::runSqr<int   , 1>, ::sum::runSqr<int   , 2>, ::sum::runSqr<int   , 3>, ::sum::runSqr<int   , 4>},
-        {0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
-        {0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
-    };
-
-    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::sum::getBufSize(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-    buf.setTo(Scalar::all(0));
-
-    const func_t func = funcs[src.depth()][src.channels()];
-
-    double result[4];
-    func(src, buf.data, result, mask);
-
-    return Scalar(result[0], result[1], result[2], result[3]);
-}
-
-////////////////////////////////////////////////////////////////////////
-// minMax
-
-namespace minMax
-{
-    void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
-
-    template <typename T>
-    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-}
-
-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
-{
-    GpuMat buf;
-    minMax(src, minVal, maxVal, mask, buf);
-}
-
-void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
-{
-    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
-    static const func_t funcs[] =
-    {
-        ::minMax::run<uchar>,
-        ::minMax::run<schar>,
-        ::minMax::run<ushort>,
-        ::minMax::run<short>,
-        ::minMax::run<int>,
-        ::minMax::run<float>,
-        ::minMax::run<double>
-    };
-
-    CV_Assert( src.channels() == 1 );
-    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::minMax::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-
-    const func_t func = funcs[src.depth()];
-
-    double temp1, temp2;
-    func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
-}
-
-////////////////////////////////////////////////////////////////////////
-// minMaxLoc
-
-namespace minMaxLoc
-{
-    void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows);
-
-    template <typename T>
-    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
-}
-
-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
-{
-    GpuMat valBuf, locBuf;
-    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
-}
-
-void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
-                        const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
-{
-    typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
-    static const func_t funcs[] =
-    {
-        ::minMaxLoc::run<uchar>,
-        ::minMaxLoc::run<schar>,
-        ::minMaxLoc::run<ushort>,
-        ::minMaxLoc::run<short>,
-        ::minMaxLoc::run<int>,
-        ::minMaxLoc::run<float>,
-        ::minMaxLoc::run<double>
-    };
-
-    CV_Assert( src.channels() == 1 );
-    CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size valbuf_size, locbuf_size;
-    ::minMaxLoc::getBufSize(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height, locbuf_size.width, locbuf_size.height);
-    ensureSizeIsEnough(valbuf_size, CV_8U, valBuf);
-    ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
-
-    const func_t func = funcs[src.depth()];
-
-    double temp1, temp2;
-    Point temp3, temp4;
-    func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, minLoc ? &minLoc->x : &temp3.x, maxLoc ? &maxLoc->x : &temp4.x, valBuf, locBuf);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// countNonZero
-
-namespace countNonZero
-{
-    void getBufSize(int cols, int rows, int& bufcols, int& bufrows);
-
-    template <typename T>
-    int run(const PtrStepSzb src, PtrStep<unsigned int> buf);
-}
-
-int cv::gpu::countNonZero(const GpuMat& src)
-{
-    GpuMat buf;
-    return countNonZero(src, buf);
-}
-
-int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
-{
-    typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
-    static const func_t funcs[] =
-    {
-        ::countNonZero::run<uchar>,
-        ::countNonZero::run<schar>,
-        ::countNonZero::run<ushort>,
-        ::countNonZero::run<short>,
-        ::countNonZero::run<int>,
-        ::countNonZero::run<float>,
-        ::countNonZero::run<double>
-    };
-
-    CV_Assert(src.channels() == 1);
-
-    if (src.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    Size buf_size;
-    ::countNonZero::getBufSize(src.cols, src.rows, buf_size.width, buf_size.height);
-    ensureSizeIsEnough(buf_size, CV_8U, buf);
-
-    const func_t func = funcs[src.depth()];
-
-    return func(src, buf);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// reduce
-
-namespace reduce
-{
-    template <typename T, typename S, typename D>
-    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-
-    template <typename T, typename S, typename D>
-    void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
-}
-
-void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
-{
-    CV_Assert( src.channels() <= 4 );
-    CV_Assert( dim == 0 || dim == 1 );
-    CV_Assert( reduceOp == REDUCE_SUM || reduceOp == REDUCE_AVG || reduceOp == REDUCE_MAX || reduceOp == REDUCE_MIN );
-
-    if (dtype < 0)
-        dtype = src.depth();
-
-    dst.create(1, dim == 0 ? src.cols : src.rows, CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
-
-    if (dim == 0)
-    {
-        typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
-        static const func_t funcs[7][7] =
-        {
-            {
-                ::reduce::rows<unsigned char, int, unsigned char>,
-                0/*::reduce::rows<unsigned char, int, signed char>*/,
-                0/*::reduce::rows<unsigned char, int, unsigned short>*/,
-                0/*::reduce::rows<unsigned char, int, short>*/,
-                ::reduce::rows<unsigned char, int, int>,
-                ::reduce::rows<unsigned char, float, float>,
-                ::reduce::rows<unsigned char, double, double>
-            },
-            {
-                0/*::reduce::rows<signed char, int, unsigned char>*/,
-                0/*::reduce::rows<signed char, int, signed char>*/,
-                0/*::reduce::rows<signed char, int, unsigned short>*/,
-                0/*::reduce::rows<signed char, int, short>*/,
-                0/*::reduce::rows<signed char, int, int>*/,
-                0/*::reduce::rows<signed char, float, float>*/,
-                0/*::reduce::rows<signed char, double, double>*/
-            },
-            {
-                0/*::reduce::rows<unsigned short, int, unsigned char>*/,
-                0/*::reduce::rows<unsigned short, int, signed char>*/,
-                ::reduce::rows<unsigned short, int, unsigned short>,
-                0/*::reduce::rows<unsigned short, int, short>*/,
-                ::reduce::rows<unsigned short, int, int>,
-                ::reduce::rows<unsigned short, float, float>,
-                ::reduce::rows<unsigned short, double, double>
-            },
-            {
-                0/*::reduce::rows<short, int, unsigned char>*/,
-                0/*::reduce::rows<short, int, signed char>*/,
-                0/*::reduce::rows<short, int, unsigned short>*/,
-                ::reduce::rows<short, int, short>,
-                ::reduce::rows<short, int, int>,
-                ::reduce::rows<short, float, float>,
-                ::reduce::rows<short, double, double>
-            },
-            {
-                0/*::reduce::rows<int, int, unsigned char>*/,
-                0/*::reduce::rows<int, int, signed char>*/,
-                0/*::reduce::rows<int, int, unsigned short>*/,
-                0/*::reduce::rows<int, int, short>*/,
-                ::reduce::rows<int, int, int>,
-                ::reduce::rows<int, float, float>,
-                ::reduce::rows<int, double, double>
-            },
-            {
-                0/*::reduce::rows<float, float, unsigned char>*/,
-                0/*::reduce::rows<float, float, signed char>*/,
-                0/*::reduce::rows<float, float, unsigned short>*/,
-                0/*::reduce::rows<float, float, short>*/,
-                0/*::reduce::rows<float, float, int>*/,
-                ::reduce::rows<float, float, float>,
-                ::reduce::rows<float, double, double>
-            },
-            {
-                0/*::reduce::rows<double, double, unsigned char>*/,
-                0/*::reduce::rows<double, double, signed char>*/,
-                0/*::reduce::rows<double, double, unsigned short>*/,
-                0/*::reduce::rows<double, double, short>*/,
-                0/*::reduce::rows<double, double, int>*/,
-                0/*::reduce::rows<double, double, float>*/,
-                ::reduce::rows<double, double, double>
-            }
-        };
-
-        const func_t func = funcs[src.depth()][dst.depth()];
-
-        if (!func)
-            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
-
-        func(src.reshape(1), dst.data, reduceOp, StreamAccessor::getStream(stream));
-    }
-    else
-    {
-        typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
-        static const func_t funcs[7][7] =
-        {
-            {
-                ::reduce::cols<unsigned char, int, unsigned char>,
-                0/*::reduce::cols<unsigned char, int, signed char>*/,
-                0/*::reduce::cols<unsigned char, int, unsigned short>*/,
-                0/*::reduce::cols<unsigned char, int, short>*/,
-                ::reduce::cols<unsigned char, int, int>,
-                ::reduce::cols<unsigned char, float, float>,
-                ::reduce::cols<unsigned char, double, double>
-            },
-            {
-                0/*::reduce::cols<signed char, int, unsigned char>*/,
-                0/*::reduce::cols<signed char, int, signed char>*/,
-                0/*::reduce::cols<signed char, int, unsigned short>*/,
-                0/*::reduce::cols<signed char, int, short>*/,
-                0/*::reduce::cols<signed char, int, int>*/,
-                0/*::reduce::cols<signed char, float, float>*/,
-                0/*::reduce::cols<signed char, double, double>*/
-            },
-            {
-                0/*::reduce::cols<unsigned short, int, unsigned char>*/,
-                0/*::reduce::cols<unsigned short, int, signed char>*/,
-                ::reduce::cols<unsigned short, int, unsigned short>,
-                0/*::reduce::cols<unsigned short, int, short>*/,
-                ::reduce::cols<unsigned short, int, int>,
-                ::reduce::cols<unsigned short, float, float>,
-                ::reduce::cols<unsigned short, double, double>
-            },
-            {
-                0/*::reduce::cols<short, int, unsigned char>*/,
-                0/*::reduce::cols<short, int, signed char>*/,
-                0/*::reduce::cols<short, int, unsigned short>*/,
-                ::reduce::cols<short, int, short>,
-                ::reduce::cols<short, int, int>,
-                ::reduce::cols<short, float, float>,
-                ::reduce::cols<short, double, double>
-            },
-            {
-                0/*::reduce::cols<int, int, unsigned char>*/,
-                0/*::reduce::cols<int, int, signed char>*/,
-                0/*::reduce::cols<int, int, unsigned short>*/,
-                0/*::reduce::cols<int, int, short>*/,
-                ::reduce::cols<int, int, int>,
-                ::reduce::cols<int, float, float>,
-                ::reduce::cols<int, double, double>
-            },
-            {
-                0/*::reduce::cols<float, float, unsigned char>*/,
-                0/*::reduce::cols<float, float, signed char>*/,
-                0/*::reduce::cols<float, float, unsigned short>*/,
-                0/*::reduce::cols<float, float, short>*/,
-                0/*::reduce::cols<float, float, int>*/,
-                ::reduce::cols<float, float, float>,
-                ::reduce::cols<float, double, double>
-            },
-            {
-                0/*::reduce::cols<double, double, unsigned char>*/,
-                0/*::reduce::cols<double, double, signed char>*/,
-                0/*::reduce::cols<double, double, unsigned short>*/,
-                0/*::reduce::cols<double, double, short>*/,
-                0/*::reduce::cols<double, double, int>*/,
-                0/*::reduce::cols<double, double, float>*/,
-                ::reduce::cols<double, double, double>
-            }
-        };
-
-        const func_t func = funcs[src.depth()][dst.depth()];
-
-        if (!func)
-            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of input and output array formats");
-
-        func(src, dst.data, src.channels(), reduceOp, StreamAccessor::getStream(stream));
-    }
-}
-
-#endif
--- a/modules/gpu/src/precomp.hpp
+++ b/modules/gpu/src/precomp.hpp
@@ -76,10 +76,6 @@
        #include <cufft.h>
    #endif

-    #ifdef HAVE_CUBLAS
-        #include <cublas.h>
-    #endif
-
    #include "internal_shared.hpp"
    #include "opencv2/core/stream_accessor.hpp"

--- a/modules/gpu/src/split_merge.cpp
+++ b/modules/gpu/src/split_merge.cpp
@@ -1,171 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::merge(const GpuMat* /*src*/, size_t /*count*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-void cv::gpu::merge(const std::vector<GpuMat>& /*src*/, GpuMat& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-void cv::gpu::split(const GpuMat& /*src*/, GpuMat* /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-void cv::gpu::split(const GpuMat& /*src*/, std::vector<GpuMat>& /*dst*/, Stream& /*stream*/) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace split_merge
-    {
-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
-    }
-}}}
-
-namespace
-{
-    void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
-    {
-        using namespace ::cv::gpu::cudev::split_merge;
-
-        CV_Assert(src);
-        CV_Assert(n > 0);
-
-        int depth = src[0].depth();
-        Size size = src[0].size();
-
-        if (depth == CV_64F)
-        {
-            if (!deviceSupports(NATIVE_DOUBLE))
-                CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-        }
-
-        bool single_channel_only = true;
-        int total_channels = 0;
-
-        for (size_t i = 0; i < n; ++i)
-        {
-            CV_Assert(src[i].size() == size);
-            CV_Assert(src[i].depth() == depth);
-            single_channel_only = single_channel_only && src[i].channels() == 1;
-            total_channels += src[i].channels();
-        }
-
-        CV_Assert(single_channel_only);
-        CV_Assert(total_channels <= 4);
-
-        if (total_channels == 1)
-            src[0].copyTo(dst);
-        else
-        {
-            dst.create(size, CV_MAKETYPE(depth, total_channels));
-
-            PtrStepSzb src_as_devmem[4];
-            for(size_t i = 0; i < n; ++i)
-                src_as_devmem[i] = src[i];
-
-            PtrStepSzb dst_as_devmem(dst);
-            merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
-        }
-    }
-
-    void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
-    {
-        using namespace ::cv::gpu::cudev::split_merge;
-
-        CV_Assert(dst);
-
-        int depth = src.depth();
-        int num_channels = src.channels();
-
-        if (depth == CV_64F)
-        {
-            if (!deviceSupports(NATIVE_DOUBLE))
-                CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-        }
-
-        if (num_channels == 1)
-        {
-            src.copyTo(dst[0]);
-            return;
-        }
-
-        for (int i = 0; i < num_channels; ++i)
-            dst[i].create(src.size(), depth);
-
-        CV_Assert(num_channels <= 4);
-
-        PtrStepSzb dst_as_devmem[4];
-        for (int i = 0; i < num_channels; ++i)
-            dst_as_devmem[i] = dst[i];
-
-        PtrStepSzb src_as_devmem(src);
-        split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
-    }
-}
-
-void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
-{
-    ::merge(src, n, dst, StreamAccessor::getStream(stream));
-}
-
-
-void cv::gpu::merge(const std::vector<GpuMat>& src, GpuMat& dst, Stream& stream)
-{
-    ::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
-{
-    ::split(src, dst, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::split(const GpuMat& src, std::vector<GpuMat>& dst, Stream& stream)
-{
-    dst.resize(src.channels());
-    if(src.channels() > 0)
-        ::split(src, &dst[0], StreamAccessor::getStream(stream));
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp