Merge pull request #1503 from ilya-lavrenov:ocl_arithm

Andrey Pavlenko 2013-09-26 15:50:54 +04:00 committed by OpenCV Buildbot
commit 3e91350a31
33 changed files with 1866 additions and 11305 deletions

View File

@@ -7,29 +7,29 @@ ocl::oclMat::convertTo
----------------------
Returns void
.. ocv:function:: void ocl::oclMat::convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const
.. ocv:function:: void ocl::oclMat::convertTo(oclMat &m, int rtype, double alpha = 1, double beta = 0) const
:param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated
:param m: the destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.
:param rtype: The desired destination matrix type, or rather, the depth(since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source.
:param rtype: the desired destination matrix type, or rather, the depth (since the number of channels will be the same with the source one). If rtype is negative, the destination matrix will have the same type as the source.
:param alpha: must be default now
:param alpha: optional scale factor.
:param beta: must be default now
:param beta: optional delta added to the scaled values.
The method converts source pixel values to the target datatype. saturate cast is applied in the end to avoid possible overflows. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4.
The method converts source pixel values to the target datatype. Saturate cast is applied in the end to avoid possible overflows. Supports all data types.
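For illustration, a minimal host-side sketch (not part of this commit; it assumes an OpenCL-capable device and the 2.4-era ``opencv2/ocl/ocl.hpp`` header, and the matrix contents are arbitrary)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_src(64, 64, CV_8UC1, cv::Scalar(128)), h_dst;
        cv::ocl::oclMat d_src(h_src), d_dst;          // upload to the device
        d_src.convertTo(d_dst, CV_32F, 1.0 / 255.0);  // 8U -> 32F, rescaled to [0, 1]
        d_dst.download(h_dst);                        // read the result back
        return 0;
    }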
ocl::oclMat::copyTo
-------------------
Returns void
.. ocv:function:: void ocl::oclMat::copyTo( oclMat &m, const oclMat &mask ) const
.. ocv:function:: void ocl::oclMat::copyTo(oclMat &m, const oclMat &mask = oclMat()) const
:param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated
:param m: The destination matrix. If it does not have a proper size or type before the operation, it will be reallocated.
:param mask(optional): The operation mask. Its non-zero elements indicate, which matrix elements need to be copied
:param mask: The operation mask. Its non-zero elements indicate, which matrix elements need to be copied.
Copies the matrix to another one. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4
Copies the matrix to another one. Supports all data types.
ocl::oclMat::setTo
------------------
@@ -37,171 +37,163 @@ Returns oclMat
.. ocv:function:: oclMat& ocl::oclMat::setTo(const Scalar &s, const oclMat &mask = oclMat())
:param s: Assigned scalar, which is converted to the actual array type
:param s: Assigned scalar, which is converted to the actual array type.
:param mask: The operation mask of the same size as ``*this``
:param mask: The operation mask of the same size as ``*this`` and type ``CV_8UC1``.
Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4.
Sets all or some of the array elements to the specified value. This is the advanced variant of Mat::operator=(const Scalar s) operator. Supports all data types.
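An illustrative sketch of masked assignment (not from this commit; the mask is ``CV_8UC1`` as required above, and all names are illustrative)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_img(64, 64, CV_8UC3, cv::Scalar::all(0));
        cv::Mat h_mask = cv::Mat::zeros(64, 64, CV_8UC1);
        h_mask(cv::Rect(0, 0, 32, 32)).setTo(255);    // select the top-left quadrant
        cv::ocl::oclMat d_img(h_img), d_mask(h_mask);
        d_img.setTo(cv::Scalar(0, 255, 0), d_mask);   // masked pixels become green
        return 0;
    }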
ocl::absdiff
------------------
Returns void
.. ocv:function:: void ocl::absdiff( const oclMat& a, const oclMat& b, oclMat& c )
.. ocv:function:: void ocl::absdiff(const oclMat& src1, const oclMat& src2, oclMat& dst)
.. ocv:function:: void ocl::absdiff( const oclMat& a, const Scalar& s, oclMat& c )
.. ocv:function:: void ocl::absdiff(const oclMat& src1, const Scalar& s, oclMat& dst)
:param src1: the first input array.
:param a: The first input array
:param src2: the second input array, must be the same size and same type as ``src1``.
:param b: The second input array, must be the same size and same type as a
:param s: scalar, the second input parameter.
:param s: Scalar, the second input parameter
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param c: The destination array, it will have the same size and same type as a
Computes per-element absolute difference between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element absolute difference between two arrays or between an array and a scalar. Supports all data types.
ocl::add
------------------
Returns void
.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c )
.. ocv:function:: void ocl::add(const oclMat & src1, const oclMat & src2, oclMat & dst, const oclMat & mask = oclMat())
.. ocv:function:: void ocl::add( const oclMat & a, const oclMat & b, oclMat & c, const oclMat & mask )
.. ocv:function:: void ocl::add(const oclMat & src1, const Scalar & s, oclMat & dst, const oclMat & mask = oclMat())
.. ocv:function:: void ocl::add( const oclMat & a, const Scalar & sc, oclMat & c, const oclMat & mask=oclMat() )
:param src1: the first input array.
:param a: The first input array
:param src2: the second input array, must be the same size and same type as ``src1``.
:param b: The second input array, must be the same size and same type as src1
:param s: scalar, the second input parameter
:param sc: Scalar, the second input parameter
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param c: The destination array, it will have the same size and same type as src1
:param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
:param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
Computes per-element addition between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element addition between two arrays or between an array and a scalar. Supports all data types.
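A sketch exercising the matrix, scalar, and masked overloads (illustrative values; assumes a working OpenCL device)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_a = cv::Mat::ones(4, 4, CV_32FC1), h_b = cv::Mat::ones(4, 4, CV_32FC1);
        cv::Mat h_mask = cv::Mat::zeros(4, 4, CV_8UC1);
        h_mask.at<uchar>(0, 0) = 1;                   // mark a single element
        cv::ocl::oclMat a(h_a), b(h_b), mask(h_mask), dst;
        cv::ocl::add(a, b, dst);                      // dst = a + b
        cv::ocl::add(a, cv::Scalar(5), dst);          // dst = a + 5
        cv::ocl::add(a, b, dst, mask);                // only the marked element changes
        return 0;
    }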
ocl::subtract
------------------
Returns void
.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c )
.. ocv:function:: void ocl::subtract(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
.. ocv:function:: void ocl::subtract( const oclMat& a, const oclMat& b, oclMat& c, const oclMat& mask )
.. ocv:function:: void ocl::subtract(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
.. ocv:function:: void ocl::subtract( const oclMat& a, const Scalar& sc, oclMat& c, const oclMat& mask=oclMat() )
:param src1: the first input array.
.. ocv:function:: void ocl::subtract( const Scalar& sc, const oclMat& a, oclMat& c, const oclMat& mask=oclMat() )
:param src2: the second input array, must be the same size and same type as ``src1``.
:param s: scalar, the second input parameter.
:param a: The first input array
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param b: The second input array, must be the same size and same type as src1
:param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
:param sc: Scalar, the second input parameter
:param c: The destination array, it will have the same size and same type as src1
:param mask: he optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
Computes per-element subtraction between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element subtraction between two arrays or between an array and a scalar. Supports all data types.
ocl::multiply
------------------
Returns void
.. ocv:function:: void ocl::multiply( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 )
.. ocv:function:: void ocl::multiply(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)
:param a: The first input array
:param src1: the first input array.
:param b: The second input array, must be the same size and same type as src1
:param src2: the second input array, must be the same size and same type as ``src1``.
:param c: The destination array, it will have the same size and same type as src1
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param scale: must be 1 now
:param scale: optional scale factor.
Computes per-element multiplication between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element multiplication between two arrays or between an array and a scalar. Supports all data types.
ocl::divide
------------------
Returns void
.. ocv:function:: void ocl::divide( const oclMat& a, const oclMat& b, oclMat& c, double scale=1 )
.. ocv:function:: void ocl::divide(const oclMat& src1, const oclMat& src2, oclMat& dst, double scale = 1)
.. ocv:function:: void ocl::divide( double scale, const oclMat& b, oclMat& c )
.. ocv:function:: void ocl::divide(double scale, const oclMat& src1, oclMat& dst)
:param a: The first input array
:param src1: the first input array.
:param b: The second input array, must be the same size and same type as src1
:param src2: the second input array, must be the same size and same type as ``src1``.
:param c: The destination array, it will have the same size and same type as src1
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param scale: must be 1 now
:param scale: scalar factor.
Computes per-element division between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element division between two arrays or between an array and a scalar. Supports all data types.
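A sketch of the scaled product and both division overloads, with values chosen so the results are easy to verify by hand::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_a(4, 4, CV_32FC1, cv::Scalar(6)), h_b(4, 4, CV_32FC1, cv::Scalar(2));
        cv::ocl::oclMat a(h_a), b(h_b), dst;
        cv::ocl::multiply(a, b, dst, 0.5);            // dst = 0.5 * a .* b -> all 6
        cv::ocl::divide(a, b, dst);                   // dst = a ./ b       -> all 3
        cv::ocl::divide(2.0, b, dst);                 // dst = 2 / b        -> all 1
        return 0;
    }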
ocl::bitwise_and
------------------
Returns void
.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
.. ocv:function:: void ocl::bitwise_and( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
.. ocv:function:: void ocl::bitwise_and(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
:param src1: The first input array
:param src1: the first input array.
:param src2: The second input array, must be the same size and same type as src1
:param src2: the second input array, must be the same size and same type as ``src1``.
:param s: Scalar, the second input parameter
:param s: scalar, the second input parameter.
:param dst: The destination array, it will have the same size and same type as src1
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
:param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
Computes per-element bitwise AND between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element bitwise AND between two arrays or between an array and a scalar. Supports all data types.
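A sketch of per-pixel bit masking with the matrix and scalar overloads (illustrative constants)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_img(64, 64, CV_8UC1, cv::Scalar(0xC3));
        cv::Mat h_bits(64, 64, CV_8UC1, cv::Scalar(0x0F));
        cv::ocl::oclMat img(h_img), bits(h_bits), dst;
        cv::ocl::bitwise_and(img, bits, dst);             // 0xC3 & 0x0F = 0x03
        cv::ocl::bitwise_and(img, cv::Scalar(0xF0), dst); // 0xC3 & 0xF0 = 0xC0
        return 0;
    }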
ocl::bitwise_or
------------------
Returns void
.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
.. ocv:function:: void ocl::bitwise_or( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
.. ocv:function:: void ocl::bitwise_or(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
:param src1: The first input array
:param src1: the first input array.
:param src2: The second input array, must be the same size and same type as src1
:param src2: the second input array, must be the same size and same type as ``src1``.
:param s: Scalar, the second input parameter
:param s: scalar, the second input parameter.
:param dst: The destination array, it will have the same size and same type as src1
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
:param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
Computes per-element bitwise OR between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element bitwise OR between two arrays or between an array and a scalar. Supports all data types.
ocl::bitwise_xor
------------------
Returns void
.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask=oclMat() )
.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const oclMat& src2, oclMat& dst, const oclMat& mask = oclMat())
.. ocv:function:: void ocl::bitwise_xor( const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask=oclMat() )
.. ocv:function:: void ocl::bitwise_xor(const oclMat& src1, const Scalar& s, oclMat& dst, const oclMat& mask = oclMat())
:param src1: The first input array
:param src1: the first input array.
:param src2: The second input array, must be the same size and same type as src1
:param src2: the second input array, must be the same size and same type as ``src1``.
:param sc: Scalar, the second input parameter
:param s: scalar, the second input parameter.
:param dst: The destination array, it will have the same size and same type as src1
:param dst: the destination array, it will have the same size and same type as ``src1``.
:param mask: The optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed
:param mask: the optional operation mask, 8-bit single channel array; specifies elements of the destination array to be changed.
Computes per-element bitwise XOR between two arrays or between an array and a scalar. Supports all data types except CV_8S.
Computes per-element bitwise XOR between two arrays or between an array and a scalar. Supports all data types.
ocl::bitwise_not
------------------
@@ -209,11 +201,11 @@ Returns void
.. ocv:function:: void ocl::bitwise_not(const oclMat &src, oclMat &dst)
:param src: The input array
:param src: the input array.
:param dst: The destination array, it will have the same size and same type as src1
:param dst: the destination array, it will have the same size and same type as ``src``.
The functions bitwise not compute per-element bit-wise inversion of the source array:. Supports all data types except CV_8S.
The function bitwise_not computes per-element bit-wise inversion of the source array. Supports all data types.
ocl::cartToPolar
------------------
@@ -221,17 +213,17 @@ Returns void
.. ocv:function:: void ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false)
:param x: The array of x-coordinates; must be single-precision or double-precision floating-point array
:param x: the array of x-coordinates; must be single-precision or double-precision floating-point array.
:param y: The array of y-coordinates; it must have the same size and same type as x
:param y: the array of y-coordinates; it must have the same size and same type as ``x``.
:param magnitude: The destination array of magnitudes of the same size and same type as x
:param magnitude: the destination array of magnitudes of the same size and same type as ``x``.
:param angle: The destination array of angles of the same size and same type as x. The angles are measured in radians (0 to 2pi ) or in degrees (0 to 360 degrees).
:param angle: the destination array of angles of the same size and same type as ``x``. The angles are measured in radians (0 to 2pi) or in degrees (0 to 360 degrees).
:param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees
:param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.
Calculates the magnitude and angle of 2d vectors. Supports only CV_32F and CV_64F data types.
Calculates the magnitude and angle of 2D vectors. Supports only ``CV_32F`` and ``CV_64F`` data types.
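A sketch using a 3-4-5 triangle so the expected magnitude and angle are easy to check by hand::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_x(3, 3, CV_32FC1, cv::Scalar(3)), h_y(3, 3, CV_32FC1, cv::Scalar(4));
        cv::ocl::oclMat x(h_x), y(h_y), mag, ang;
        cv::ocl::cartToPolar(x, y, mag, ang, true);   // mag = 5, ang ~= 53.13 degrees
        return 0;
    }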
ocl::polarToCart
------------------
@@ -239,57 +231,57 @@ Returns void
.. ocv:function:: void ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false)
:param magnitude: The source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (=Mat()) - in this case the function assumes that all the magnitudes are =1. If it's not empty, it must have the same size and same type as angle
:param magnitude: the source floating-point array of magnitudes of 2D vectors. It can be an empty matrix (``=Mat()``); in this case the function assumes that all the magnitudes are 1. If it is not empty, it must have the same size and same type as ``angle``.
:param angle: The source floating-point array of angles of the 2D vectors
:param angle: the source floating-point array of angles of the 2D vectors.
:param x: The destination array of x-coordinates of 2D vectors; will have the same size and the same type as angle
:param x: the destination array of x-coordinates of 2D vectors; will have the same size and the same type as ``angle``.
:param y: The destination array of y-coordinates of 2D vectors; will have the same size and the same type as angle
:param y: the destination array of y-coordinates of 2D vectors; will have the same size and the same type as ``angle``.
:param angleInDegrees: The flag indicating whether the angles are measured in radians, which is default mode, or in degrees
:param angleInDegrees: the flag indicating whether the angles are measured in radians, which is default mode, or in degrees.
The function polarToCart computes the cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only CV_32F and CV_64F data types.
The function polarToCart computes the Cartesian coordinates of each 2D vector represented by the corresponding elements of magnitude and angle. Supports only ``CV_32F`` and ``CV_64F`` data types.
ocl::compare
------------------
Returns void
.. ocv:function:: void ocl::compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop)
.. ocv:function:: void ocl::compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop)
:param a: The first source array
:param src1: the first source array.
:param b: The second source array; must have the same size and same type as a
:param src2: the second source array; must have the same size and same type as ``src1``.
:param c: The destination array; will have the same size as a
:param dst: the destination array; will have the same size as ``src1`` and type ``CV_8UC1``.
:param cmpop: The flag specifying the relation between the elements to be checked
:param cmpop: the flag specifying the relation between the elements to be checked.
Performs per-element comparison of two arrays or of an array and a scalar value. Supports all single-channel data types except CV_8S.
Performs per-element comparison of two arrays or of an array and a scalar value. Supports all data types.
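A sketch building a ``CV_8UC1`` mask with one of the core comparison flags (``CMP_LT`` here; illustrative values)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_a(4, 4, CV_32FC1, cv::Scalar(1)), h_b(4, 4, CV_32FC1, cv::Scalar(2));
        cv::ocl::oclMat a(h_a), b(h_b), dst;
        cv::ocl::compare(a, b, dst, cv::CMP_LT);      // dst = 255 where a < b, else 0
        return 0;
    }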
ocl::exp
------------------
Returns void
.. ocv:function:: void ocl::exp(const oclMat &a, oclMat &b)
.. ocv:function:: void ocl::exp(const oclMat &src, oclMat &dst)
:param a: The first source array
:param src: the first source array.
:param b: The dst array; must have the same size and same type as a
:param dst: the dst array; must have the same size and same type as ``src``.
The function exp calculates the exponent of every element of the input array. Supports only CV_32FC1 data type.
The function exp calculates the exponent of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.
ocl::log
------------------
Returns void
.. ocv:function:: void ocl::log(const oclMat &a, oclMat &b)
.. ocv:function:: void ocl::log(const oclMat &src, oclMat &dst)
:param a: The first source array
:param src: the first source array.
:param b: The dst array; must have the same size and same type as a
:param dst: the dst array; must have the same size and same type as ``src``.
The function log calculates the log of every element of the input array. Supports only CV_32FC1 data type.
The function log calculates the natural logarithm of every element of the input array. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.
ocl::LUT
------------------
@@ -297,13 +289,13 @@ Returns void
.. ocv:function:: void ocl::LUT(const oclMat &src, const oclMat &lut, oclMat &dst)
:param src: Source array of 8-bit elements
:param src: source array of 8-bit elements.
:param lut: Look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array
:param lut: look-up table of 256 elements. In the case of multi-channel source array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the source array.
:param dst: Destination array; will have the same size and the same number of channels as src, and the same depth as lut
:param dst: destination array; will have the same size and the same number of channels as ``src``, and the same depth as ``lut``.
Performs a look-up table transform of an array. Supports only CV_8UC1 and CV_8UC4 data type.
Performs a look-up table transform of an array.
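A sketch that builds a 256-entry inversion table on the host and applies it on the device::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat table(1, 256, CV_8UC1);
        for (int i = 0; i < 256; ++i)
            table.at<uchar>(i) = (uchar)(255 - i);    // photographic negative
        cv::Mat h_src(64, 64, CV_8UC1, cv::Scalar(10));
        cv::ocl::oclMat src(h_src), lut(table), dst;
        cv::ocl::LUT(src, lut, dst);                  // every pixel becomes 245
        return 0;
    }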
ocl::magnitude
------------------
@@ -311,25 +303,25 @@ Returns void
.. ocv:function:: void ocl::magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude)
:param x: The floating-point array of x-coordinates of the vectors
:param x: the floating-point array of x-coordinates of the vectors.
:param y: he floating-point array of y-coordinates of the vectors; must have the same size as x
:param y: the floating-point array of y-coordinates of the vectors; must have the same size as ``x``.
:param magnitude: The destination array; will have the same size and same type as x
:param magnitude: the destination array; will have the same size and same type as ``x``.
The function magnitude calculates magnitude of 2D vectors formed from the corresponding elements of x and y arrays. Supports only CV_32F and CV_64F data type.
The function magnitude calculates the magnitude of 2D vectors formed from the corresponding elements of ``x`` and ``y`` arrays. Supports only ``CV_32F`` and ``CV_64F`` data types.
ocl::flip
------------------
Returns void
.. ocv:function:: void ocl::flip( const oclMat& a, oclMat& b, int flipCode )
.. ocv:function:: void ocl::flip(const oclMat& src, oclMat& dst, int flipCode)
:param a: Source image.
:param src: source image.
:param b: Destination image
:param dst: destination image.
:param flipCode: Specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes.
:param flipCode: specifies how to flip the array: 0 means flipping around the x-axis, positive (e.g., 1) means flipping around y-axis, and negative (e.g., -1) means flipping around both axes.
The function flip flips the array in one of three different ways (row and column indices are 0-based). Supports all data types.
@@ -339,13 +331,13 @@ Returns void
.. ocv:function:: void ocl::meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev)
:param mtx: Source image.
:param mtx: source image.
:param mean: The output parameter: computed mean value
:param mean: the output parameter: computed mean value.
:param stddev: The output parameter: computed standard deviation
:param stddev: the output parameter: computed standard deviation.
The function meanStdDev computes the mean and the standard deviation of array elements, independently for each channel, and returns them via the output parameters. Supports all data types except CV_32F,CV_64F
The function meanStdDev computes the mean and the standard deviation of array elements, independently for each channel, and returns them via the output parameters. Supports all data types except ``CV_32F``, ``CV_64F``.
ocl::merge
------------------
@@ -353,9 +345,9 @@ Returns void
.. ocv:function:: void ocl::merge(const vector<oclMat> &src, oclMat &dst)
:param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type
:param src: The source array or vector of the single-channel matrices to be merged. All the matrices in src must have the same size and the same type.
:param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices
:param dst: The destination array; will have the same size and the same depth as src, the number of channels will match the number of source matrices.
Composes a multi-channel array from several single-channel arrays. Supports all data types.
@@ -379,13 +371,13 @@ Returns the calculated norm
.. ocv:function:: double ocl::norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2)
:param src1: The first source array
:param src1: the first source array.
:param src2: The second source array of the same size and the same type as src1
:param src2: the second source array of the same size and the same type as ``src1``.
:param normType: Type of the norm
:param normType: type of the norm.
Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only CV_8UC1 data type.
Calculates absolute array norm, absolute difference norm, or relative difference norm. Supports only ``CV_8UC1`` data type.
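A sketch computing the L2 distance between two constant images; the expected value is checkable by hand::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_a(8, 8, CV_8UC1, cv::Scalar(10)), h_b(8, 8, CV_8UC1, cv::Scalar(13));
        cv::ocl::oclMat a(h_a), b(h_b);
        double d = cv::ocl::norm(a, b, cv::NORM_L2);  // sqrt(64 * 3^2) = 24
        (void)d;
        return 0;
    }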
ocl::phase
------------------
@@ -393,15 +385,15 @@ Returns void
.. ocv:function:: void ocl::phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false)
:param x: The source floating-point array of x-coordinates of 2D vectors
:param x: the source floating-point array of x-coordinates of 2D vectors.
:param y: The source array of y-coordinates of 2D vectors; must have the same size and the same type as x
:param y: the source array of y-coordinates of 2D vectors; must have the same size and the same type as ``x``.
:param angle: The destination array of vector angles; it will have the same size and same type as x
:param angle: the destination array of vector angles; it will have the same size and same type as ``x``.
:param angleInDegrees: When it is true, the function will compute angle in degrees, otherwise they will be measured in radians
:param angleInDegrees: when it is true, the function will compute angle in degrees, otherwise they will be measured in radians.
The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of x and y. Supports only CV_32FC1 and CV_64FC1 data types.
The function phase computes the rotation angle of each 2D vector that is formed from the corresponding elements of ``x`` and ``y``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.
ocl::pow
------------------
@@ -409,13 +401,13 @@ Returns void
.. ocv:function:: void ocl::pow(const oclMat &x, double p, oclMat &y)
:param x: The source array
:param x: the source array.
:param power: The exponent of power;The source floating-point array of angles of the 2D vectors
:param p: the exponent of power.
:param y: The destination array, should be the same type as the source
:param y: the destination array, should be the same type as the source.
The function pow raises every element of the input array to p. Supports only CV_32FC1 and CV_64FC1 data type.
The function pow raises every element of the input array to ``p``. Supports only ``CV_32FC1`` and ``CV_64FC1`` data types.
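A sketch cubing every element (illustrative values)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_x(3, 3, CV_32FC1, cv::Scalar(2));
        cv::ocl::oclMat x(h_x), y;
        cv::ocl::pow(x, 3.0, y);                      // every element becomes 8
        return 0;
    }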
ocl::transpose
------------------
@@ -423,26 +415,26 @@ Returns void
.. ocv:function:: void ocl::transpose(const oclMat &src, oclMat &dst)
:param src: The source array
:param src: the source array.
:param dst: The destination array of the same type as src
:param dst: the destination array of the same type as ``src``.
Transposes a matrix. Supports 8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1 data types.
Transposes a matrix (when ``src`` == ``dst`` and the matrix is square, the operation is performed in place). Supports all data types.
ocl::dft
------------
Performs a forward or inverse discrete Fourier transform (1D or 2D) of the floating point matrix.
.. ocv:function:: void ocl::dft( const oclMat& src, oclMat& dst, Size dft_size=Size(0, 0), int flags=0 )
.. ocv:function:: void ocl::dft(const oclMat& src, oclMat& dst, Size dft_size = Size(), int flags = 0)
:param src: Source matrix (real or complex).
:param src: source matrix (real or complex).
:param dst: Destination matrix (real or complex).
:param dst: destination matrix (real or complex).
:param dft_size: Size of original input, which is used for transformation from complex to real.
:param dft_size: size of original input, which is used for transformation from complex to real.
:param flags: Optional flags:
:param flags: optional flags:
* **DFT_ROWS** transforms each individual row of the source matrix.
@@ -452,9 +444,9 @@ Performs a forward or inverse discrete Fourier transform (1D or 2D) of the float
* **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of real-complex transform, so the destination matrix must be real.
Use to handle real matrices ( ``CV32FC1`` ) and complex matrices in the interleaved format ( ``CV32FC2`` ).
Use it to handle real matrices (``CV_32FC1``) and complex matrices in the interleaved format (``CV_32FC2``).
The dft_size must be a product of powers of 2, 3 and 5. The real-to-complex dft output is not the same as the cpu version. Real-to-complex and complex-to-real transforms do not support DFT_ROWS
The ``dft_size`` must be a product of powers of ``2``, ``3`` and ``5``. The real-to-complex DFT output is not the same as the CPU version. Real-to-complex and complex-to-real transforms do not support ``DFT_ROWS``.
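A sketch of a forward real-to-complex transform (a sketch only: it assumes the clAmdFft backend noted in the header is available, and uses power-of-2 sizes)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_src(4, 8, CV_32FC1);
        cv::randu(h_src, 0, 1);                       // arbitrary real input
        cv::ocl::oclMat src(h_src), dst;
        cv::ocl::dft(src, dst, cv::Size(), 0);        // interleaved CV_32FC2 output
        return 0;
    }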
.. seealso:: :ocv:func:`dft`
@@ -464,22 +456,22 @@ Performs generalized matrix multiplication.
.. ocv:function:: void ocl::gemm(const oclMat& src1, const oclMat& src2, double alpha, const oclMat& src3, double beta, oclMat& dst, int flags = 0)
:param src1: First multiplied input matrix that should be ``CV_32FC1`` type.
:param src1: first multiplied input matrix that should be ``CV_32FC1`` type.
:param src2: Second multiplied input matrix of the same type as ``src1`` .
:param src2: second multiplied input matrix of the same type as ``src1``.
:param alpha: Weight of the matrix product.
:param alpha: weight of the matrix product.
:param src3: Third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2`` .
:param src3: third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2``.
:param beta: Weight of ``src3`` .
:param beta: weight of ``src3``.
:param dst: Destination matrix. It has the proper size and the same type as input matrices.
:param dst: destination matrix. It has the proper size and the same type as input matrices.
:param flags: Operation flags:
:param flags: operation flags:
* **GEMM_1_T** transpose ``src1``
* **GEMM_2_T** transpose ``src2``
* **GEMM_1_T** transpose ``src1``.
* **GEMM_2_T** transpose ``src2``.
.. seealso:: :ocv:func:`gemm`
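A sketch of ``dst = alpha*src1*src2 + beta*src3`` with ``CV_32FC1`` inputs (assumes the clAmdBlas backend noted in the header)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        cv::Mat h_a = cv::Mat::eye(3, 3, CV_32FC1);
        cv::Mat h_b(3, 3, CV_32FC1, cv::Scalar(2));
        cv::Mat h_c = cv::Mat::zeros(3, 3, CV_32FC1);
        cv::ocl::oclMat A(h_a), B(h_b), C(h_c), dst;
        cv::ocl::gemm(A, B, 1.0, C, 0.0, dst);        // dst = I * B = B
        return 0;
    }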
@@ -489,28 +481,29 @@ Returns void
.. ocv:function:: void ocl::sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false)
:param keys: The keys to be used as sorting indices.
:param keys: the keys to be used as sorting indices.
:param values: The array of values.
:param values: the array of values.
:param isGreaterThan: Determine sorting order.
:param isGreaterThan: determine sorting order.
:param method: supported sorting methods:
* **SORT_BITONIC** bitonic sort, only support power-of-2 buffer size
* **SORT_SELECTION** selection sort, currently cannot sort duplicate keys
* **SORT_MERGE** merge sort
* **SORT_RADIX** radix sort, only support signed int/float keys(``CV_32S``/``CV_32F``)
* **SORT_BITONIC** bitonic sort, only support power-of-2 buffer size.
* **SORT_SELECTION** selection sort, currently cannot sort duplicate keys.
* **SORT_MERGE** merge sort.
* **SORT_RADIX** radix sort, only support signed int/float keys(``CV_32S``/``CV_32F``).
Returns the sorted result of all the elements in values based on equivalent keys.
The element unit in the values to be sorted is determined from the data type,
i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless of its matrix dimension.
The element unit in the values to be sorted is determined from the data type, i.e., a ``CV_32FC2`` input ``{a1a2, b1b2}`` will be considered as two elements, regardless of its matrix dimension.
Both keys and values will be sorted in place.
Keys need to be a **single**-channel `oclMat`.
Keys need to be a **single**-channel ``oclMat``.
Example::
input -
keys   = {2, 3, 1} (CV_8UC1)
values = {10,5, 4,3, 6,2} (CV_8UC2)
output -
keys   = {1, 2, 3} (CV_8UC1)
values = {6,2, 10,5, 4,3} (CV_8UC2)
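The same data as a host-side sketch (``SORT_MERGE`` is chosen because the buffer size here is not a power of 2; the ``cv::ocl::SORT_*`` constants are assumed to match the method list above)::

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    int main()
    {
        uchar k[] = {2, 3, 1};
        uchar v[] = {10, 5, 4, 3, 6, 2};
        cv::ocl::oclMat keys(cv::Mat(1, 3, CV_8UC1, k).clone());
        cv::ocl::oclMat values(cv::Mat(1, 3, CV_8UC2, v).clone());
        cv::ocl::sortByKey(keys, values, cv::ocl::SORT_MERGE);  // ascending by default
        return 0;
    }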

View File

@@ -158,8 +158,8 @@ namespace cv
static void setContext(Info &oclinfo);
enum {CL_DOUBLE, CL_UNIFIED_MEM, CL_VER_1_2};
bool supportsFeature(int ftype);
size_t computeUnits();
bool supportsFeature(int ftype) const;
size_t computeUnits() const;
void* oclContext();
void* oclCommandQueue();
};
@@ -268,13 +268,12 @@ namespace cv
//! returns deep copy of the oclMatrix, i.e. the data is copied
oclMat clone() const;
//! copies the oclMatrix content to "m".
//! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
// It calls m.create(this->size(), this->type()).
// It supports any data type
void copyTo( oclMat &m ) const;
//! copies those oclMatrix elements to "m" that are marked with non-zero mask elements.
//It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
void copyTo( oclMat &m, const oclMat &mask ) const;
void copyTo( oclMat &m, const oclMat &mask = oclMat()) const;
//! converts oclMatrix to another datatype with optional scalng. See cvConvertScale.
//It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4
void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const;
@@ -409,61 +408,52 @@ namespace cv
CV_EXPORTS void split(const oclMat &src, vector<oclMat> &dst);
////////////////////////////// Arithmetics ///////////////////////////////////
//#if defined DOUBLE_SUPPORT
//typedef double F;
//#else
//typedef float F;
//#endif
// CV_EXPORTS void addWeighted(const oclMat& a,F alpha, const oclMat& b,F beta,F gama, oclMat& c);
CV_EXPORTS void addWeighted(const oclMat &a, double alpha, const oclMat &b, double beta, double gama, oclMat &c);
//! adds one matrix to another (c = a + b)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c);
//! adds one matrix to another (c = a + b)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void add(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
//! adds scalar to a matrix (c = a + s)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void add(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
//! subtracts one matrix from another (c = a - b)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c);
//! subtracts one matrix from another (c = a - b)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void subtract(const oclMat &a, const oclMat &b, oclMat &c, const oclMat &mask);
//! subtracts scalar from a matrix (c = a - s)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void subtract(const oclMat &a, const Scalar &sc, oclMat &c, const oclMat &mask = oclMat());
//! subtracts scalar from a matrix (c = a - s)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void subtract(const Scalar &sc, const oclMat &a, oclMat &c, const oclMat &mask = oclMat());
//! computes element-wise product of the two arrays (c = a * b)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void multiply(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
//! multiplies matrix to a number (dst = scalar * src)
// supports CV_32FC1 only
CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
//! computes element-wise quotient of the two arrays (c = a / b)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void divide(const oclMat &a, const oclMat &b, oclMat &c, double scale = 1);
//! computes element-wise quotient of the two arrays (c = a / b)
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void divide(double scale, const oclMat &b, oclMat &c);
//! compares elements of two arrays (c = a <cmpop> b)
// supports except CV_8SC1,CV_8SC2,CV8SC3,CV_8SC4 types
CV_EXPORTS void compare(const oclMat &a, const oclMat &b, oclMat &c, int cmpop);
//! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama)
CV_EXPORTS void addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst);
//! adds one matrix to another (dst = src1 + src2)
// supports all data types
CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
//! adds scalar to a matrix (dst = src1 + s)
// supports all data types
CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
//! subtracts one matrix from another (dst = src1 - src2)
// supports all data types
CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
//! subtracts scalar from a matrix (dst = src1 - s)
// supports all data types
CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
//! computes element-wise product of the two arrays (dst = src1 * scale * src2)
// supports all data types
CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
//! multiplies matrix to a number (dst = scalar * src)
// supports all data types
CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst);
//! computes element-wise quotient of the two arrays (dst = src1 * scale / src2)
// supports all data types
CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1);
//! computes element-wise quotient of the two arrays (dst = scale / src)
// supports all data types
CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst);
//! compares elements of two arrays (dst = src1 <cmpop> src2)
// supports all data types
CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop);
//! transposes the matrix
// supports CV_8UC1, 8UC4, 8SC4, 16UC2, 16SC2, 32SC1 and 32FC1.(the same as cuda)
// supports all data types
CV_EXPORTS void transpose(const oclMat &src, oclMat &dst);
//! computes element-wise absolute difference of two arrays (c = abs(a - b))
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void absdiff(const oclMat &a, const oclMat &b, oclMat &c);
//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
// supports all types except CV_8SC1,CV_8SC2,CV8SC3 and CV_8SC4
CV_EXPORTS void absdiff(const oclMat &a, const Scalar &s, oclMat &c);
//! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2))
// supports all data types
CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst);
//! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s))
// supports all data types
CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst);
//! computes mean value and standard deviation of all or selected array elements
// supports except CV_32F,CV_64F
@@ -481,7 +471,7 @@ namespace cv
//! reverses the order of the rows, columns or both in a matrix
// supports all types
CV_EXPORTS void flip(const oclMat &a, oclMat &b, int flipCode);
CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode);
//! computes sum of array elements
// disabled until the crash is fixed
@@ -492,13 +482,11 @@ namespace cv
//! finds global minimum and maximum array elements and returns their values
// support all C1 types
CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat());
CV_EXPORTS void minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat& buf);
//! finds global minimum and maximum array elements and returns their values with locations
// support all C1 types
CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0,
const oclMat &mask = oclMat());
@@ -527,30 +515,27 @@ namespace cv
// This is not truly a bilateral filter. Instead of using user provided fixed parameters,
// the function calculates a constant at each window based on local standard deviation,
// and use this constant to do filtering.
// supports 8UC1 8UC3
// supports 8UC1, 8UC3
CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT);
//! computes exponent of each matrix element (b = e**a)
// supports only CV_32FC1 type
CV_EXPORTS void exp(const oclMat &a, oclMat &b);
//! computes exponent of each matrix element (dst = e**src)
// supports only CV_32FC1, CV_64FC1 type
CV_EXPORTS void exp(const oclMat &src, oclMat &dst);
//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
// supports only CV_32FC1 type
CV_EXPORTS void log(const oclMat &a, oclMat &b);
//! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src))
// supports only CV_32FC1, CV_64FC1 type
CV_EXPORTS void log(const oclMat &src, oclMat &dst);
//! computes magnitude of each (x(i), y(i)) vector
// supports only CV_32F CV_64F type
// supports only CV_32F, CV_64F type
CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude);
CV_EXPORTS void magnitudeSqr(const oclMat &x, const oclMat &y, oclMat &magnitude);
CV_EXPORTS void magnitudeSqr(const oclMat &x, oclMat &magnitude);
//! computes angle (angle(i)) of each (x(i), y(i)) vector
// supports only CV_32F CV_64F type
// supports only CV_32F, CV_64F type
CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false);
//! the function raises every element of the input array to p
//! support only CV_32F CV_64F type
// support only CV_32F, CV_64F type
CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y);
//! converts Cartesian coordinates to polar
@@ -564,14 +549,17 @@ namespace cv
//! performs per-element bit-wise inversion
// supports all types
CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst);
//! calculates per-element bit-wise disjunction of two arrays
// supports all types
CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
//! calculates per-element bit-wise conjunction of two arrays
// supports all types
CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat());
//! calculates per-element bit-wise "exclusive or" operation
// supports all types
CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat());
@@ -591,12 +579,13 @@ namespace cv
CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
//! computes convolution of two images
//! support only CV_32FC1 type
// support only CV_32FC1 type
CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result);
CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
CV_EXPORTS void setIdentity(oclMat& src, double val);
//////////////////////////////// Filter Engine ////////////////////////////////
/*!
@@ -988,7 +977,7 @@ namespace cv
// real to complex dft requires at least v1.8 clAmdFft
// real to complex dft output is not the same with cpu version
// real to complex and complex to real does not support DFT_ROWS
CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0);
CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0);
//! implements generalized matrix product algorithm GEMM from BLAS
// The functionality requires clAmdBlas library

View File

@@ -842,54 +842,6 @@ PERF_TEST_P(PowFixture, pow, OCL_TYPICAL_MAT_SIZES)
OCL_PERF_ELSE
}
///////////// MagnitudeSqr////////////////////////
typedef TestBaseWithParam<Size> MagnitudeSqrFixture;
PERF_TEST_P(MagnitudeSqrFixture, MagnitudeSqr, OCL_TYPICAL_MAT_SIZES)
{
const Size srcSize = GetParam();
Mat src1(srcSize, CV_32FC1), src2(srcSize, CV_32FC1),
dst(srcSize, CV_32FC1);
declare.in(src1, src2, WARMUP_RNG).out(dst);
if (RUN_OCL_IMPL)
{
ocl::oclMat oclSrc1(src1), oclSrc2(src2), oclDst(srcSize, src1.type());
OCL_TEST_CYCLE() cv::ocl::magnitudeSqr(oclSrc1, oclSrc2, oclDst);
oclDst.download(dst);
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
else if (RUN_PLAIN_IMPL)
{
ASSERT_EQ(1, src1.channels());
TEST_CYCLE()
{
for (int y = 0; y < srcSize.height; ++y)
{
const float * const src1Data = reinterpret_cast<float *>(src1.data + src1.step * y);
const float * const src2Data = reinterpret_cast<float *>(src2.data + src2.step * y);
float * const dstData = reinterpret_cast<float *>(dst.data + dst.step * y);
for (int x = 0; x < srcSize.width; ++x)
{
float t0 = src1Data[x] * src1Data[x];
float t1 = src2Data[x] * src2Data[x];
dstData[x] = t0 + t1;
}
}
}
SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
else
OCL_PERF_ELSE
}
///////////// AddWeighted////////////////////////
typedef Size_MatType AddWeightedFixture;

File diff suppressed because it is too large

View File

@@ -130,7 +130,7 @@ public:
{
Size src_size = src.size();
// Delete those two clause below which exist before, However, the result is alos correct
// Deleted those two clauses below, which existed before; however, the result is also correct
// dst.create(src_size, src.type());
// dst = Scalar(0.0);
@@ -394,23 +394,8 @@ public:
{
Filter2DEngine_GPU::apply(src, dst);
//if (iters > 1)
//{
// Size wholesize;
// Point ofs;
// dst.locateROI(wholesize,ofs);
// int rows = dst.rows, cols = dst.cols;
// dst.adjustROI(ofs.y,-ofs.y-rows+dst.wholerows,ofs.x,-ofs.x-cols+dst.wholecols);
// dst.copyTo(morfBuf);
// dst.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
// morfBuf.adjustROI(-ofs.y,ofs.y+rows-dst.wholerows,-ofs.x,ofs.x+cols-dst.wholecols);
// //morfBuf.create(src.size(),src.type());
// //Filter2DEngine_GPU::apply(dst, morfBuf);
// //morfBuf.copyTo(dst);
//}
for (int i = 1; i < iters; ++i)
{
//dst.swap(morfBuf);
Size wholesize;
Point ofs;
dst.locateROI(wholesize, ofs);
@@ -720,24 +705,16 @@ public:
virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
{
Size src_size = src.size();
//int src_type = src.type();
int cn = src.oclchannels();
//dst.create(src_size, src_type);
//dst = Scalar(0.0);
//dstBuf.create(src_size, src_type);
dstBuf.create(src_size.height + ksize.height - 1, src_size.width, CV_MAKETYPE(CV_32F, cn));
//dstBuf = Scalar(0.0);
normalizeROI(roi, ksize, anchor, src_size);
srcROI = src(roi);
dstROI = dst(roi);
//dstBufROI = dstBuf(roi);
(*rowFilter)(srcROI, dstBuf);
//Mat rm(dstBufROI);
//std::cout << "rm " << rm << endl;
(*columnFilter)(dstBuf, dstROI);
}
@@ -1324,11 +1301,8 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
CV_Assert(src.oclchannels() == dst.oclchannels());
CV_Assert(ksize == (anchor << 1) + 1);
int src_pix_per_row, dst_pix_per_row;
//int src_offset_x, src_offset_y;
int dst_offset_in_pixel;
src_pix_per_row = src.step / src.elemSize();
//src_offset_x = (src.offset % src.step) / src.elemSize();
//src_offset_y = src.offset / src.step;
dst_pix_per_row = dst.step / dst.elemSize();
dst_offset_in_pixel = dst.offset / dst.elemSize();
@@ -1340,8 +1314,6 @@ void linearColumnFilter_gpu(const oclMat &src, const oclMat &dst, oclMat mat_ker
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholecols));
args.push_back(make_pair(sizeof(cl_int), (void *)&src.wholerows));
args.push_back(make_pair(sizeof(cl_int), (void *)&src_pix_per_row));
//args.push_back(make_pair(sizeof(cl_int),(void*)&src_offset_x));
//args.push_back(make_pair(sizeof(cl_int),(void*)&src_offset_y));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_pix_per_row));
args.push_back(make_pair(sizeof(cl_int), (void *)&dst_offset_in_pixel));
args.push_back(make_pair(sizeof(cl_mem), (void *)&mat_kernel.data));
@@ -1360,23 +1332,11 @@ Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in
linearColumnFilter_gpu<int>,
linearColumnFilter_gpu<float>
};
/*
CV_Assert(dstType == CV_8UC4 || dstType == CV_8SC4 || dstType == CV_16UC2 ||
dstType == CV_16SC2 || dstType == CV_32SC1 || dstType == CV_32FC1);
CV_Assert(bufType == CV_8UC4 || bufType == CV_8SC4 || bufType == CV_16UC2 ||
bufType == CV_16SC2 || bufType == CV_32SC1 || bufType == CV_32FC1);
Mat temp(columnKernel.size(), CV_32SC1);
columnKernel.convertTo(temp, CV_32SC1);
Mat cont_krnl = temp.reshape(1, 1);
*/
Mat temp = columnKernel.reshape(1, 1);
oclMat mat_kernel(temp);
int ksize = temp.cols;
//CV_Assert(ksize < 16);
normalizeAnchor(anchor, ksize);
return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, mat_kernel,
@@ -1414,11 +1374,8 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
}
if (ddepth < 0)
{
ddepth = src.depth();
}
//CV_Assert(ddepth == src.depth());
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
@@ -1445,19 +1402,11 @@ void cv::ocl::Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
// usually the smoothing part is the slowest to compute,
// so try to scale it instead of the faster differenciating part
if (dx == 0)
{
kx *= scale;
}
else
{
ky *= scale;
}
}
// Mat kx_, ky_;
//ky.convertTo(ky_,CV_32S,1<<8);
//kx.convertTo(kx_,CV_32S,1<<8);
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, borderType);
}
@@ -1471,19 +1420,11 @@ void cv::ocl::Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy,
// usually the smoothing part is the slowest to compute,
// so try to scale it instead of the faster differenciating part
if (dx == 0)
{
kx *= scale;
}
else
{
ky *= scale;
}
}
// Mat kx_, ky_;
//ky.convertTo(ky_,CV_32S,1<<8);
//kx.convertTo(kx_,CV_32S,1<<8);
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype);
}
@@ -1505,9 +1446,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
Mat kernel(3, 3, CV_32S, (void *)K[ksize == 3]);
if (scale != 1)
{
kernel *= scale;
}
filter2D(src, dst, ddepth, kernel, Point(-1, -1));
}
@@ -1526,14 +1465,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
// automatic detection of kernel size from sigma
if (ksize.width <= 0 && sigma1 > 0)
{
ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
}
if (ksize.height <= 0 && sigma2 > 0)
{
ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4) * 2 + 1) | 1;
}
CV_Assert(ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1);
@@ -1544,17 +1479,10 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
Mat ky;
if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
{
ky = kx;
}
else
{
ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
}
//Mat kx_, ky_;
//kx.convertTo(kx_,CV_32S,1<<8);
//ky.convertTo(ky_,CV_32S,1<<8);
return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
}
@@ -1585,14 +1513,10 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
if (bordertype != BORDER_CONSTANT)
{
if (src.rows == 1)
{
ksize.height = 1;
}
if (src.cols == 1)
{
ksize.width = 1;
}
}
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
@@ -1618,6 +1542,7 @@ void cv::ocl::adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize
{
lut.at<float>(idx++) = sigma2 / (sigma2 + x * x + y * y);
}
oclMat dlut(lut);
int depth = src.depth();
int cn = src.oclchannels();

View File

@@ -244,9 +244,6 @@ namespace cv
kernelName = "remapNNF1Constant";
}
//int channels = dst.oclchannels();
//int depth = dst.depth();
//int type = src.type();
size_t blkSizeX = 16, blkSizeY = 16;
size_t glbSizeX;
int cols = dst.cols;
@@ -499,21 +496,13 @@ namespace cv
openCLExecuteKernel(clCxt, &imgproc_median, kernelName, globalThreads, localThreads, args, src.oclchannels(), src.depth());
}
else
{
CV_Error(CV_StsUnsupportedFormat, "Non-supported filter length");
//string kernelName = "medianFilter";
//args.push_back( make_pair( sizeof(cl_int),(void*)&m));
//openCLExecuteKernel(clCxt,&imgproc_median,kernelName,globalThreads,localThreads,args,src.oclchannels(),-1);
}
}
////////////////////////////////////////////////////////////////////////
// copyMakeBorder
void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int bordertype, const Scalar &scalar)
{
//CV_Assert(src.oclchannels() != 2);
CV_Assert(top >= 0 && bottom >= 0 && left >= 0 && right >= 0);
if((dst.cols != dst.wholecols) || (dst.rows != dst.wholerows)) //has roi
{
@@ -529,10 +518,12 @@ namespace cv
{
CV_Assert((src.cols >= left) && (src.cols >= right) && (src.rows >= top) && (src.rows >= bottom));
}
if(bordertype == cv::BORDER_REFLECT_101)
{
CV_Assert((src.cols > left) && (src.cols > right) && (src.rows > top) && (src.rows > bottom));
}
dst.create(src.rows + top + bottom, src.cols + left + right, src.type());
int srcStep = src.step1() / src.oclchannels();
int dstStep = dst.step1() / dst.oclchannels();
@@ -732,19 +723,6 @@ namespace cv
}
openCLExecuteKernel(src.clCxt, &imgproc_copymakeboder, kernelName, globalThreads, localThreads, args, -1, -1, compile_option);
//uchar* cputemp=new uchar[32*dst.wholerows];
////int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(src.clCxt->impl->clCmdQueue, (cl_mem)dst.data, CL_TRUE,
// 0, 32*dst.wholerows, cputemp, 0, NULL, NULL));
//for(int i=0;i<dst.wholerows;i++)
//{
// for(int j=0;j<dst.wholecols;j++)
// {
// cout<< (int)cputemp[i*32+j]<<" ";
// }
// cout<<endl;
//}
//delete []cputemp;
}
////////////////////////////////////////////////////////////////////////
@@ -1286,11 +1264,6 @@ namespace cv
if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
// if(!src.clCxt->supportsFeature(Context::CL_DOUBLE))
// {
// CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
// }
dst.create( src.size(), CV_8UC4 );
if( !(criteria.type & TermCriteria::MAX_ITER) )

View File

@@ -1013,7 +1013,7 @@ namespace cv
programCache->releaseProgram();
}
bool Context::supportsFeature(int ftype)
bool Context::supportsFeature(int ftype) const
{
switch(ftype)
{
@@ -1028,7 +1028,7 @@ namespace cv
}
}
size_t Context::computeUnits()
size_t Context::computeUnits() const
{
return impl->maxComputeUnits;
}

View File

@@ -347,19 +347,14 @@ static void copy_to_with_mask(const oclMat &src, oclMat &dst, const oclMat &mask
localThreads, args, -1, -1, compile_option);
}
void cv::ocl::oclMat::copyTo( oclMat &m ) const
{
CV_DbgAssert(!this->empty());
m.create(size(), type());
openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
data, step, cols * elemSize(), rows, offset);
}
void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const
{
if (mask.empty())
{
copyTo(mat);
CV_DbgAssert(!this->empty());
mat.create(size(), type());
openCLCopyBuffer2D(clCxt, mat.data, mat.step, mat.offset,
data, step, cols * elemSize(), rows, offset);
}
else
{

View File

@@ -1,158 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
#define CV_PI 3.1415926535897932384626433832795
char round_char(double v){
char v1=(char)v;
return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned char round_uchar(double v){
unsigned char v1=(unsigned char)v;
return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
}
short round_short(double v){
short v1=(short)v;
return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned short round_ushort(double v){
unsigned short v1=(unsigned short)v;
return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
}
int round_int(double v){
int v1=(int)v;
return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
}
char round2_char(double v){
char v1=(char)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_char_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned char round2_uchar(double v){
unsigned char v1=(unsigned char)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
}
short round2_short(double v){
short v1=(short)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
}
unsigned short round2_ushort(double v){
unsigned short v1=(unsigned short)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
}
int round2_int(double v){
int v1=(int)v;
if((v-v1)==0.5&&v1%2==0)
return v1;
else
return convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
}
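(This whole file is removed by the commit.) For the record, the round_* helpers round half away from zero via convert_*_sat(v ± 0.5), while the round2_* variants special-case the exact .5 boundary when the truncated value is even, i.e. round-half-to-even on that branch. A host-side C++ analogue, purely illustrative:

// Illustration only (not part of the kernel source): mirrors round2_int above.
static int round2_int_host(double v)
{
    int v1 = (int)v;                          // truncate toward zero
    if ((v - v1) == 0.5 && v1 % 2 == 0)       // exact halfway, even truncation
        return v1;
    return (int)(v + (v >= 0 ? 0.5 : -0.5)); // otherwise round half away from zero
}
// round2_int_host(2.5) == 2, round2_int_host(3.5) == 4;
// note round2_int_host(-2.5) == -3: the halfway test compares against +0.5,
// so negative halfway values never take the even branch.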
/*****************************************EXP***************************************/
__kernel void arithm_op_exp_5 (int rows,int cols,int srcStep,__global float *src1Mat,
__global float * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 2 ) + x;
dstMat[idx] = (float)exp((float)src1Mat[idx]);
}
}
__kernel void arithm_op_exp_6 (int rows,int cols,int srcStep,__global double *src1Mat,
__global double * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 3 ) + x;
dstMat[idx] = exp(src1Mat[idx]);
}
}
/*****************************************LOG***************************************/
__kernel void arithm_op_log_5 (int rows,int cols,int srcStep,__global float *src1Mat,
__global float * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 2 ) + x;
dstMat[idx] =(float) log((float)src1Mat[idx]);
}
}
__kernel void arithm_op_log_6 (int rows,int cols,int srcStep,__global double *src1Mat,
__global double * dstMat,int channels)
{
size_t x = get_global_id(0);
size_t y = get_global_id(1);
if (x < cols && y < rows)
{
size_t idx = y * ( srcStep >> 3 ) + x;
dstMat[idx] = log(src1Mat[idx]);
}
}
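The exp/log kernels index with y * (srcStep >> 2) + x (or >> 3 for double) because srcStep is a stride in bytes; the shift converts it to a stride in elements. The equivalent host-side arithmetic:

#include <cstddef>

// srcStep is a byte stride; dividing by the element size yields the element
// stride, so the linear index of pixel (x, y) in a float image is:
size_t element_index(size_t x, size_t y, size_t srcStepBytes)
{
    return y * (srcStepBytes / sizeof(float)) + x; // the kernel writes srcStep >> 2
}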

View File

@ -38,125 +38,66 @@
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
__kernel
void LUT_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
__kernel void LUT_C1( __global const srcT * src, __global const dstT *lut,
__global dstT *dst,
int cols1, int rows,
int src_offset1,
int lut_offset1,
int dst_offset1,
int src_step1, int dst_step1)
{
int gidx = get_global_id(0)<<2;
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int x1 = get_global_id(0);
int y = get_global_id(1);
__local uchar l[256];
l[(lidy<<4)+lidx] = table[(lidy<<4)+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= cols-4?cols-4:gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
p.x = src[src_index];
p.y = src[src_index+1];
p.z = src[src_index+2];
p.w = src[src_index+3];
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
*(__global uchar4*)(dst + dst_index) = q;
}
__kernel
void LUT2_C1_D0( __global uchar *dst,
__global const uchar *src,
__constant uchar *table,
int rows,
int precols,
int channels,
int whole_rows,
int cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//int lidx = get_local_id(0);
int lidy = get_local_id(1);
__local uchar l[256];
l[lidy] = table[lidy+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
//clamp(gidx,mask,cols-1);
gidx = gidx >= precols ? cols+gidx : gidx;
gidy = gidy >= rows?rows-1:gidy;
int src_index = src_offset + mad24(gidy,src_step,gidx);
int dst_index = dst_offset + mad24(gidy,dst_step,gidx);
//uchar4 p,q;
barrier(CLK_LOCAL_MEM_FENCE);
uchar p = src[src_index];
uchar q = l[p];
dst[dst_index] = q;
}
__kernel
void LUT_C4_D0( __global uchar4 *dst,
__global uchar4 *src,
__constant uchar *table,
int rows,
int cols,
int channels,
int whole_rows,
int whole_cols,
int src_offset,
int dst_offset,
int lut_offset,
int src_step,
int dst_step)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int lidx = get_local_id(0);
int lidy = get_local_id(1);
int src_index = mad24(gidy,src_step,gidx+src_offset);
int dst_index = mad24(gidy,dst_step,gidx+dst_offset);
__local uchar l[256];
l[lidy*16+lidx] = table[lidy*16+lidx+lut_offset];
//mem_fence(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(gidx<cols && gidy<rows)
if (x1 < cols1 && y < rows)
{
uchar4 p = src[src_index];
uchar4 q;
q.x = l[p.x];
q.y = l[p.y];
q.z = l[p.z];
q.w = l[p.w];
dst[dst_index] = q;
int src_index = mad24(y, src_step1, src_offset1 + x1);
int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
dst[dst_index] = lut[lut_offset1 + src[src_index]];
}
}
__kernel void LUT_C2( __global const srcT * src, __global const dstT *lut,
__global dstT *dst,
int cols1, int rows,
int src_offset1,
int lut_offset1,
int dst_offset1,
int src_step1, int dst_step1)
{
int x1 = get_global_id(0) << 1;
int y = get_global_id(1);
if (x1 < cols1 && y < rows)
{
int src_index = mad24(y, src_step1, src_offset1 + x1);
int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
dst[dst_index ] = lut[lut_offset1 + (src[src_index ] << 1) ];
dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 1) + 1] : dst[dst_index + 1];
}
}
__kernel void LUT_C4( __global const srcT * src, __global const dstT *lut,
__global dstT *dst,
int cols1, int rows,
int src_offset1,
int lut_offset1,
int dst_offset1,
int src_step1, int dst_step1)
{
int x1 = get_global_id(0) << 2;
int y = get_global_id(1);
if (x1 < cols1 && y < rows)
{
int src_index = mad24(y, src_step1, src_offset1 + x1);
int dst_index = mad24(y, dst_step1, dst_offset1 + x1);
dst[dst_index ] = lut[lut_offset1 + (src[src_index ] << 2) ];
dst[dst_index + 1] = x1 + 1 < cols1 ? lut[lut_offset1 + (src[src_index + 1] << 2) + 1] : dst[dst_index + 1];
dst[dst_index + 2] = x1 + 2 < cols1 ? lut[lut_offset1 + (src[src_index + 2] << 2) + 2] : dst[dst_index + 2];
dst[dst_index + 3] = x1 + 3 < cols1 ? lut[lut_offset1 + (src[src_index + 3] << 2) + 3] : dst[dst_index + 3];
}
}
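The rewritten LUT kernels are type-templated through -D defines (srcT, dstT) and process one, two, or four elements per work item; for multichannel data the table is stored interleaved, which is why the C4 variant indexes lut[lut_offset1 + (value << 2) + channel]. A hedged host-side usage sketch, assuming the module's public entry point is cv::ocl::LUT(src, lut, dst):

#include "opencv2/ocl/ocl.hpp"

void lut_demo(const cv::ocl::oclMat &src8u)
{
    cv::Mat table(1, 256, CV_8UC1);
    for (int i = 0; i < 256; ++i)
        table.at<uchar>(i) = (uchar)(255 - i); // invert lookup

    cv::ocl::oclMat lut(table), dst;
    cv::ocl::LUT(src8u, lut, dst);             // dst(i) = table(src(i))
}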

View File

@ -1,970 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////absdiff////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************absdiff*************************************/
__kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = abs_diff(src1_data, src2_data);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = abs_diff(src1_data, src2_data);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
ushort4 tmp = abs_diff(src1_data, src2_data);
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_absdiff_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
uint tmp = abs_diff(data1, data2);
int tmp_data = convert_int_sat(tmp);
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_absdiff_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = fabs(data1 - data2);
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_absdiff_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = fabs(data1-data2);
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif
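All of these per-depth absdiff kernels lean on OpenCL's built-in abs_diff, which returns |a - b| in the corresponding unsigned type and therefore cannot overflow on the subtraction; the signed variants then saturate back (convert_short4_sat and friends). A scalar C++ analogue of the pattern:

#include "opencv2/core/core.hpp" // cv::saturate_cast

// Mirrors the D3 (short) kernels: form an overflow-safe unsigned difference,
// then saturate back to the destination type.
static short absdiff_s16(short a, short b)
{
    unsigned int d = a > b ? (unsigned int)(a - b) : (unsigned int)(b - a);
    return cv::saturate_cast<short>(d);
}
// absdiff_s16(-32768, 32767) == 32767 (saturated), not a wrapped value.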
/**************************************absdiff with scalar**************************************/
__kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data), src2_data));
short2 tmp_data = convert_short2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
uint tmp_data = abs_diff(src_data1, src_data2);
int data = convert_int_sat(tmp_data);
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = fabs(src_data1 - src_data2);
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src2_data = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = fabs(src_data1 - src2_data);
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data), src2_data));
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = convert_ushort2_sat( abs_diff(convert_int2_sat(src_data1), src_data2));
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
ushort2 tmp = convert_ushort2_sat(abs_diff(convert_int2_sat(src_data1), src_data2));
short2 data = convert_short2_sat(tmp);
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(abs_diff(src_data1, src_data2));
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = fabs(src_data1 - src_data2);
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = fabs(src_data1 - src_data2);
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
uchar4 tmp_data_0 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_0), src2_data_0));
uchar4 tmp_data_1 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_1), src2_data_1));
uchar4 tmp_data_2 = convert_uchar4_sat(abs_diff(convert_int4_sat(src1_data_2), src2_data_2));
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_0.w : data_0.w;
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
? tmp_data_1.xy : data_1.xy;
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.zw : data_1.zw;
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.x : data_2.x;
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
? tmp_data_2.yzw : data_2.yzw;
*((__global uchar4 *)(dst + dst_index + 0)) = data_0;
*((__global uchar4 *)(dst + dst_index + 4)) = data_1;
*((__global uchar4 *)(dst + dst_index + 8)) = data_2;
}
}
__kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
ushort2 tmp_data_0 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
ushort2 tmp_data_1 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
ushort2 tmp_data_2 = convert_ushort2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
int2 src2_data_0 = (int2)(src2.x, src2.y);
int2 src2_data_1 = (int2)(src2.z, src2.x);
int2 src2_data_2 = (int2)(src2.y, src2.z);
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
short2 tmp_data_0 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_0), src2_data_0));
short2 tmp_data_1 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_1), src2_data_1));
short2 tmp_data_2 = convert_short2_sat(abs_diff(convert_int2_sat(src1_data_2), src2_data_2));
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
int src2_data_0 = src2.x;
int src2_data_1 = src2.y;
int src2_data_2 = src2.z;
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
int tmp_data_0 = convert_int_sat(abs_diff(src1_data_0, src2_data_0));
int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
float src2_data_0 = src2.x;
float src2_data_1 = src2.y;
float src2_data_2 = src2.z;
float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
float tmp_data_0 = fabs(src1_data_0 - src2_data_0);
float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
double src2_data_0 = src2.x;
double src2_data_1 = src2.y;
double src2_data_2 = src2.z;
double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
double tmp_data_0 = fabs(src1_data_0 - src2_data_0);
double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_absdiff_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = convert_uchar4_sat(abs_diff(convert_int4_sat(src_data1), src2));
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = convert_ushort4_sat(abs_diff(convert_int4_sat(src_data1), src2));
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = convert_short4_sat(abs_diff(convert_int4_sat(src_data1), src2));
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = convert_int4_sat(abs_diff(src_data1, src2));
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_absdiff_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 data = fabs(src_data1 - src2);
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 data = fabs(src_data1 - src2);
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif

View File

@ -52,809 +52,105 @@
#endif
#endif
#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A)+(B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A)-(B))
#endif
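ARITHM_ADD / ARITHM_SUB are chosen at build time, and the templated kernels below likewise expect T, WT, convertToT, convertToWT, and Operation to arrive as -D defines. A hedged sketch of the kind of build-option string the host side would assemble (the exact spelling used by the real host code is an assumption here):

#include <string>

// Hedged sketch: plausible build options for an 8-bit saturated add; the real
// host code may spell the defines differently.
std::string buildOptions =
    "-D ARITHM_ADD"
    " -D T=uchar -D WT=short"
    " -D convertToT=convert_uchar_sat -D convertToWT=convert_short"
    " -D Operation=+";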
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////ADD////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************add without mask**************************************/
__kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
///////////////////////////////////////////// ADD ////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_binary_op_mat(__global T *src1, int src1_step, int src1_offset,
__global T *src2, int src2_step, int src2_offset,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
uchar4 tmp_data = convert_uchar4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index]));
}
}
__kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_binary_op_mat_div(__global T *src1, int src1_step, int src1_offset,
__global T *src2, int src2_step, int src2_offset,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
ushort4 tmp_data = convert_ushort4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
T zero = (T)(0);
dst[dst_index] = src2[src2_index] == zero ? zero : convertToT(convertToWT(src1[src1_index]) / convertToWT(src2[src2_index]));
}
}
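The _div variant guards against division by zero explicitly, writing zero instead of dividing, consistent with the divide-by-zero-yields-zero convention used elsewhere in OpenCV. A host-side analogue:

#include "opencv2/core/core.hpp" // cv::saturate_cast

// Host-side analogue of arithm_binary_op_mat_div: widen, divide, saturate,
// and map x / 0 to 0 instead of invoking undefined behaviour.
template <typename T, typename WT>
static T div_elem(T a, T b)
{
    return b == T(0) ? T(0)
                     : cv::saturate_cast<T>((WT)a / (WT)b);
}
// div_elem<uchar, float>(200, 3) == 67 (rounded), div_elem<uchar, float>(200, 0) == 0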
__kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_absdiff_mat(__global T *src1, int src1_step, int src1_offset,
__global T *src2, int src2_step, int src2_offset,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), convert_int4_sat(src2_data));
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
WT value = convertToWT(src1[src1_index]) - convertToWT(src2[src2_index]);
value = value > (WT)(0) ? value : -value;
dst[dst_index] = convertToT(value);
}
}
__kernel void arithm_add_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
// binary op on two mats with an extra scale factor (used by multiply)
__kernel void arithm_binary_op_mat_scalar(__global T *src1, int src1_step, int src1_offset,
__global T *src2, int src2_step, int src2_offset,
__global WT *scalar,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
long tmp = ARITHM_OP((long)(data1), (long)(data2));
*((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
dst[dst_index] = convertToT(convertToWT(src1[src1_index]) * scalar[0] * convertToWT(src2[src2_index]));
}
}
__kernel void arithm_add_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
// binary op on two mats with an extra scale factor (used by divide)
__kernel void arithm_binary_op_mat_scalar_div(__global T *src1, int src1_step, int src1_offset,
__global T *src2, int src2_step, int src2_offset,
__global WT *scalar,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = ARITHM_OP(data1, data2);
*((__global float *)((__global char *)dst + dst_index)) = tmp;
T zero = (T)(0);
dst[dst_index] = src2[src2_index] == zero ? zero :
convertToT(convertToWT(src1[src1_index]) * scalar[0] / convertToWT(src2[src2_index]));
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
*((__global double *)((__global char *)dst + dst_index)) = ARITHM_OP(data1, data2);
}
}
#endif
/**************************************add with mask**************************************/
__kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int mask_index_fix = mask_index < 0 ? 0 : mask_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
uchar4 mask_data = vload4(0, mask + mask_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(mask_index < 0)
{
uchar4 tmp;
tmp.xyzw = (mask_index == -2) ? mask_data.zwxy:mask_data.yzwx;
mask_data.xyzw = (mask_index == -1) ? mask_data.wxyz:tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
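The uchar kernel above trades simplicity for bandwidth: each work-item handles four pixels, the destination index is rounded down to a 4-byte boundary, and the dst_start/dst_end comparisons plus the mask bytes decide lane by lane whether the freshly computed value or the old destination value is kept. A worked example of the index arithmetic in plain C, with made-up offsets:

#include <stdio.h>

int main(void)
{
    int src1_offset = 0, dst_offset = 2, x = 0;           /* hypothetical values */
    int dst_align  = dst_offset & 3;                      /* 2 */
    int src1_index = x + src1_offset - dst_align;         /* -2: clamped to 0 for the
                                                             vload4, then the uchar4
                                                             is rotated (.zwxy) */
    int dst_index  = (dst_offset + x) & (int)0xfffffffc;  /* 0: 4-byte aligned store */
    printf("dst_align=%d src1_index=%d dst_index=%d\n",
           dst_align, src1_index, dst_index);
    return 0;
}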
__kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), convert_int2_sat(src2_data));
short2 tmp_data = convert_short2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = *((__global float *)((__global char *)src2 + src2_index));
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src_data2 = *((__global double *)((__global char *)src2 + src2_index));
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
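For reference, the _Dn suffix on these kernel names is the OpenCV depth constant of the element type, as the signatures show: D0 = uchar, D2 = ushort, D3 = short, D4 = int, D5 = float, and D6 = double, with the double variants compiled only under DOUBLE_SUPPORT. In C terms:

#include <stdio.h>

enum ocv_depth { CV_8U = 0, CV_8S = 1, CV_16U = 2, CV_16S = 3,
                 CV_32S = 4, CV_32F = 5, CV_64F = 6 };

int main(void)
{
    printf("D%d=uchar D%d=ushort D%d=short D%d=int D%d=float D%d=double\n",
           CV_8U, CV_16U, CV_16S, CV_32S, CV_32F, CV_64F);
    return 0;
}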
__kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
short4 tmp = ARITHM_OP(convert_short4_sat(src1_data), convert_short4_sat(src2_data));
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), convert_int2_sat(src_data2));
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = *((__global float2 *)((__global char *)src2 + src2_index));
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = *((__global double2 *)((__global char *)src2 + src2_index));
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_short4_sat(src_data1), convert_short4_sat(src_data2)));
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), convert_int4_sat(src_data2)));
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src_data2)));
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 src_data2 = *((__global float4 *)((__global char *)src2 + src2_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
uchar mask_data = *(mask + mask_index);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 src_data2 = *((__global double4 *)((__global char *)src2 + src2_index));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
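That closes the hand-written grid of per-(channel, depth) variants: C1/C2/C4 times D0..D6, each with its own kernel. The templated kernels this commit introduces collapse the grid into one body that the host specializes at build time. The exact flags the host passes are not part of this diff, so the options string below is an assumption, but the macro names match what the new kernel bodies reference:

#include <stdio.h>

int main(void)
{
    /* Hypothetical clBuildProgram options for an 8-bit unsigned
     * instantiation with a 16-bit intermediate type. */
    const char *opts =
        "-D T=uchar -D WT=short "
        "-D convertToT=convert_uchar_sat -D convertToWT=convert_short_sat "
        "-D Operation=+";
    printf("%s\n", opts);   /* would be passed to clBuildProgram */
    return 0;
}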


@@ -42,392 +42,34 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F;
#else
typedef float F;
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void addWeighted_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
F alpha, F beta, F gama,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data, src2_data;
src1_data.x = src1_index + 0 >= 0 ? src1[src1_index + 0] : 0;
src1_data.y = src1_index + 1 >= 0 ? src1[src1_index + 1] : 0;
src1_data.z = src1_index + 2 >= 0 ? src1[src1_index + 2] : 0;
src1_data.w = src1_index + 3 >= 0 ? src1[src1_index + 3] : 0;
src2_data.x = src2_index + 0 >= 0 ? src2[src2_index + 0] : 0;
src2_data.y = src2_index + 1 >= 0 ? src2[src2_index + 1] : 0;
src2_data.z = src2_index + 2 >= 0 ? src2[src2_index + 2] : 0;
src2_data.w = src2_index + 3 >= 0 ? src2[src2_index + 3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
short4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
uchar4 tmp_data = convert_uchar4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void addWeighted(__global T * src1, int src1_step1, int src1_offset1,
__global T * src2, int src2_step1, int src2_offset1,
__global T * dst, int dst_step1, int dst_offset1,
WT alpha, WT beta, WT gama,
int cols1, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols1 && y < rows)
{
int src1_index = mad24(y, src1_step1, x + src1_offset1);
int src2_index = mad24(y, src2_step1, x + src2_offset1);
int dst_index = mad24(y, dst_step1, x + dst_offset1);
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
dst[dst_index] = convertToT(src1[src1_index]*alpha + src2[src2_index]*beta + gama);
}
}
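The replacement is the whole commit in miniature: one templated kernel, indices computed in element units (the _1 suffixes and cols1 suggest steps and widths pre-multiplied by the channel count on the host; the host side is not part of this section), arithmetic done in the wider type WT, and a single saturating convertToT on the way out. What one work-item computes, written as plain C for uchar data with WT = float (a sketch, not the module's host code):

#include <stdio.h>

static unsigned char add_weighted_px(unsigned char a, unsigned char b,
                                     float alpha, float beta, float gama)
{
    float v = a * alpha + b * beta + gama;
    if (v < 0.f)   v = 0.f;            /* saturate, mirroring convert_uchar_sat */
    if (v > 255.f) v = 255.f;
    return (unsigned char)v;
}

int main(void)
{
    /* 0.5*100 + 0.5*200 + 10 = 160 */
    printf("%d\n", add_weighted_px(100, 200, 0.5f, 0.5f, 10.f));
    return 0;
}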
__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
__global ushort *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global ushort *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
ushort4 tmp_data = convert_ushort4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
__global short *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global short *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__global int *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
__global int *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
float4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp.w = src1_data.w * alpha + src2_data.w * beta + gama;
int4 tmp_data = convert_int4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global int4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
float4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
// float4 tmp_data = convert_float4(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
__global double *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
__global double *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
double4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 16 >= dst_start) && (dst_index + 16 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 24 >= dst_start) && (dst_index + 24 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global double4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
#endif


@@ -0,0 +1,79 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////
///////////////////////////////// add with mask //////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_binary_op_mat_mask(__global T * src1, int src1_step, int src1_offset,
__global T * src2, int src2_step, int src2_offset,
__global uchar * mask, int mask_step, int mask_offset,
__global T * dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int mask_index = mad24(y, mask_step, x + mask_offset);
if (mask[mask_index])
{
int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = mad24(y, src2_step, x + src2_offset);
int dst_index = mad24(y, dst_step, dst_offset + x);
dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation convertToWT(src2[src2_index]));
}
}
}
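Unlike the vectorized kernels it replaces, this one tests the mask byte before doing anything else, so a masked-off pixel costs a single load and no arithmetic; Operation is again an infix token supplied as a build define. A hypothetical launch sketch in C, rounding the global size up to a multiple of the work-group size; the kernel, queue and argument setup are assumed to exist elsewhere:

#include <CL/cl.h>

cl_int launch_mat_mask(cl_command_queue queue, cl_kernel kernel,
                       int cols, int rows)
{
    size_t local[2]  = { 16, 16 };
    size_t global[2] = {
        ((size_t)cols + local[0] - 1) / local[0] * local[0],
        ((size_t)rows + local[1] - 1) / local[1] * local[1]
    };
    /* one work-item per pixel; the in-kernel bounds test discards the
     * round-up overshoot */
    return clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local,
                                  0, NULL, NULL);
}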


@@ -51,463 +51,61 @@
#endif
#endif
#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A)+(B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A)-(B))
#endif
/**************************************add with scalar without mask**************************************/
__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
int4 src2_data = (int4)(src2.x, src2.x, src2.x, src2.x);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
}
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////// Add with scalar /////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_binary_op_scalar (__global T *src1, int src1_step, int src1_offset,
__global WT *scalar,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, x + src1_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
}
}
__kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_absdiff_scalar(__global T *src1, int src1_step, int src1_offset,
__global WT *src2,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, x + src1_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
WT value = convertToWT(src1[src1_index]) - src2[0];
value = value > (WT)(0) ? value : -value;
dst[dst_index] = convertToT(value);
}
}
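The widening in arithm_absdiff_scalar is what makes it correct: the subtraction happens in WT, so a difference that would wrap in the narrow type stays negative until the explicit absolute value. A plain C illustration for uchar data:

#include <stdio.h>

int main(void)
{
    unsigned char a = 10, s = 200;
    unsigned char wrapped = (unsigned char)(a - s);  /* 66: wrapped modulo 256 */
    int wide = (int)a - (int)s;                      /* -190: widened first */
    int absdiff = wide > 0 ? wide : -wide;           /* 190: the right answer */
    printf("%d vs %d\n", wrapped, absdiff);
    return 0;
}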
__kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
short2 tmp_data = convert_short2_sat(tmp);
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
// scalar divided by matrix
__kernel void arithm_binary_op_scalar_div(__global T *src1, int src1_step, int src1_offset,
__global WT *scalar,
__global T *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, x + src1_offset);
int dst_index = mad24(y, dst_step, x + dst_offset);
T zero = (T)(0);
dst[dst_index] = src1[src1_index] == zero ? zero : convertToT(scalar[0] / convertToWT(src1[src1_index]));
}
}
__kernel void arithm_s_add_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = ARITHM_OP(src_data1, src_data2);
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src2_data = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = ARITHM_OP(src_data1, src2_data);
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
ushort2 data = convert_ushort2_sat(tmp);
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
short2 data = convert_short2_sat(tmp);
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = ARITHM_OP(src_data1, src_data2);
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = ARITHM_OP(src_data1, src_data2);
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 data = ARITHM_OP(src_data1, src2);
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 data = ARITHM_OP(src_data1, src2);
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif


@@ -51,561 +51,28 @@
#endif
#endif
#ifdef ARITHM_ADD
#define ARITHM_OP(A,B) ((A)+(B))
#elif defined ARITHM_SUB
#define ARITHM_OP(A,B) ((A)-(B))
#endif
///////////////////////////////////////////////////////////////////////////////////
//////////////////////////// Add with scalar with mask ////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_binary_op_scalar_mask(__global T *src1, int src1_step, int src1_offset,
                                           __global WT *scalar,
                                           __global uchar *mask, int mask_step, int mask_offset,
                                           __global T *dst, int dst_step, int dst_offset,
                                           int cols, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, mask_step, x + mask_offset);
        if (mask[mask_index])
        {
            int src1_index = mad24(y, src1_step, x + src1_offset);
            int dst_index = mad24(y, dst_step, dst_offset + x);

            dst[dst_index] = convertToT(convertToWT(src1[src1_index]) Operation scalar[0]);
        }
    }
}
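// T, WT, convertToT, convertToWT and Operation are not defined in this file;
// the host supplies them through OpenCL build options when compiling the
// program. A plausible instantiation for 8-bit unsigned addition (illustrative
// only, not taken verbatim from the host source) would be:
//     -D T=uchar -D WT=int -D convertToT=convert_uchar_sat
//     -D convertToWT=convert_int_sat "-D Operation=+"
// Widening to WT before the operation and saturating back via convertToT
// reproduces the saturate_cast behaviour of the per-depth kernels it replaces.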
__kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
ushort2 tmp_data = convert_ushort2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
int2 src2_data = (int2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src1_data), src2_data);
short2 tmp_data = convert_short2_sat(tmp);
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = convert_int_sat(ARITHM_OP((long)src_data1, (long)src_data2));
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C1_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
float src_data1 = *((__global float *)((__global char *)src1 + src1_index));
float src_data2 = src2.x;
float dst_data = *((__global float *)((__global char *)dst + dst_index));
float data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global float *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
double src_data1 = *((__global double *)((__global char *)src1 + src1_index));
double src_data2 = src2.x;
double dst_data = *((__global double *)((__global char *)dst + dst_index));
double data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global double *)((__global char *)dst + dst_index)) = data;
}
}
#endif
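// Naming convention of the per-type kernels: C<n> is the channel count and
// D<d> the OpenCV depth (D0=uchar, D1=char, D2=ushort, D3=short, D4=int,
// D5=float, D6=double). The integer depths widen to int or long and saturate
// on the store; the float and double variants apply ARITHM_OP directly, since
// no saturation is needed there.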
__kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
int4 src2_data = (int4)(src2.x, src2.y, src2.x, src2.y);
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = ARITHM_OP(convert_int4_sat(src1_data), src2_data);
uchar4 tmp_data = convert_uchar4_sat(tmp);
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
ushort2 data = convert_ushort2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
int2 tmp = ARITHM_OP(convert_int2_sat(src_data1), src_data2);
short2 data = convert_short2_sat(tmp);
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = convert_int2_sat(ARITHM_OP(convert_long2_sat(src_data1), convert_long2_sat(src_data2)));
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
float2 src_data1 = *((__global float2 *)((__global char *)src1 + src1_index));
float2 src_data2 = (float2)(src2.x, src2.y);
float2 dst_data = *((__global float2 *)((__global char *)dst + dst_index));
float2 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global float2 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
double2 src_data1 = *((__global double2 *)((__global char *)src1 + src1_index));
double2 src_data2 = (double2)(src2.x, src2.y);
double2 dst_data = *((__global double2 *)((__global char *)dst + dst_index));
double2 data = ARITHM_OP(src_data1, src_data2);
data = mask_data ? data : dst_data;
*((__global double2 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
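// In the two-channel kernels a single mask byte guards a whole pixel, so both
// channels are selected or rejected together via the vector ternary above.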
__kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = convert_uchar4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = convert_ushort4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = convert_short4_sat(ARITHM_OP(convert_int4_sat(src_data1), src2));
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = convert_int4_sat(ARITHM_OP(convert_long4_sat(src_data1), convert_long4_sat(src2)));
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_add_with_mask_C4_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
float4 src_data1 = *((__global float4 *)((__global char *)src1 + src1_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 data = ARITHM_OP(src_data1, src2);
data = mask_data ? data : dst_data;
*((__global float4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_add_with_mask_C4_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
uchar mask_data = *(mask + mask_index);
double4 src_data1 = *((__global double4 *)((__global char *)src1 + src1_index));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
double4 data = ARITHM_OP(src_data1, src2);
data = mask_data ? data : dst_data;
*((__global double4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
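// All of the arithm_s_add_with_mask_* kernels above are the per-type code this
// commit deletes; they are superseded by the single templated
// arithm_binary_op_scalar_mask kernel at the top of the file.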


@@ -43,303 +43,25 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//bitwise_binary without mask for and, or, xor operators
/////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////// bitwise_binary //////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef OP_BINARY
#define OP_BINARY &
#endif
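// OP_BINARY defaults to & so this file compiles stand-alone; the host can
// override it with -D "OP_BINARY=|" or -D "OP_BINARY=^" to build the or/xor
// variants of the per-depth kernels below from the same source.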
__kernel void arithm_bitwise_binary(__global uchar * src1, int src1_step, int src1_offset,
                                    __global uchar * src2, int src2_step, int src2_offset,
                                    __global uchar * dst, int dst_step, int dst_offset,
                                    int cols1, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols1 && y < rows)
    {
        int src1_index = mad24(y, src1_step, x + src1_offset);
        int src2_index = mad24(y, src2_step, x + src2_offset);
        int dst_index = mad24(y, dst_step, dst_offset + x);

        dst[dst_index] = src1[src1_index] Operation src2[src2_index];
    }
}
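// Bitwise operations are type-agnostic, so the replacement kernel above walks
// the matrices as flat byte arrays. cols1 is presumably the row width in bytes
// (cols * channels * bytes-per-element) supplied by the host, which lets one
// kernel cover every depth and channel count.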
__kernel void arithm_bitwise_binary_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global char4 *)(dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_binary_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data OP_BINARY src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_binary_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data OP_BINARY src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_bitwise_binary_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 OP_BINARY data2;
*((__global int *)((__global char *)dst + dst_index)) = tmp;
}
}
__kernel void arithm_bitwise_binary_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 data1 = *((__global char4 *)((__global char *)src1 + src1_index));
char4 data2 = *((__global char4 *)((__global char *)src2 + src2_index));
char4 tmp = data1 OP_BINARY data2;
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_binary_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 data2 = *((__global char8 *)((__global char *)src2 + src2_index));
*((__global char8 *)((__global char *)dst + dst_index)) = data1 OP_BINARY data2;
}
}
#endif
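// Note that the deleted D5 and D6 variants above declare their buffers as char
// vectors rather than float/double: AND, OR and XOR only shuffle bits, so the
// kernels never perform floating-point arithmetic on the data.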


@@ -43,767 +43,31 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
#ifndef OP_BINARY
#define OP_BINARY &
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_binary with mask**************************************/
__kernel void arithm_bitwise_binary_mask(__global uchar * src1, int src1_step, int src1_offset,
                                         __global uchar * src2, int src2_step, int src2_offset,
                                         __global uchar * mask, int mask_step, int mask_offset, int elemSize,
                                         __global uchar * dst, int dst_step, int dst_offset,
                                         int cols1, int rows)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols1 && y < rows)
    {
        int mask_index = mad24(y, mask_step, mask_offset + (x / elemSize));
        if (mask[mask_index])
        {
            int src1_index = mad24(y, src1_step, x + src1_offset);
            int src2_index = mad24(y, src2_step, x + src2_offset);
            int dst_index = mad24(y, dst_step, x + dst_offset);

            dst[dst_index] = src1[src1_index] Operation src2[src2_index];
        }
    }
}
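// x indexes bytes while the mask holds one byte per pixel, so x / elemSize
// maps a byte position back to its pixel; an entire pixel is then written or
// skipped as a unit, matching the per-pixel mask semantics of the old kernels.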
__kernel void arithm_bitwise_binary_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = vload4(0, src2 + src2_index);
uchar4 mask_data = vload4(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = convert_char((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_char((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = convert_char((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = convert_char((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort2 src2_data = vload2(0, (__global ushort *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data OP_BINARY src2_data;
data.x = convert_ushort((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_ushort((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data OP_BINARY src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = *((__global int *)((__global char *)src2 + src2_index));
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
char4 src_data1 = *((__global char4 *)((__global char *)src1 + src1_index));
char4 src_data2 = *((__global char4 *)((__global char *)src2 + src2_index));
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C1_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = vload4(0, src2 + src2_index);
uchar2 mask_data = vload2(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = *((__global ushort2 *)((__global char *)src2 + src2_index));
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = *((__global short2 *)((__global char *)src2 + src2_index));
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = *((__global int2 *)((__global char *)src2 + src2_index));
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src_data2 = *((__global char8 *)((__global char *)src2 + src2_index));
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C2_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 src_data2 = *((__global uchar4 *)(src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 src_data2 = *((__global char4 *)(src2 + src2_index));
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 src_data2 = *((__global ushort4 *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src_data2 = *((__global short4 *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 src_data2 = *((__global int4 *)((__global char *)src2 + src2_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_binary_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 4) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
char16 src_data1 = *((__global char16 *)((__global char *)src1 + src1_index));
char16 src_data2 = *((__global char16 *)((__global char *)src2 + src2_index));
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_binary_with_mask_C4_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 5) + src2_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
uchar mask_data = *(mask + mask_index);
char8 src_data1_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0));
char8 src_data1_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8));
char8 src_data1_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
char8 src_data1_3 = *((__global char8 *)((__global char *)src1 + src1_index + 24));
char8 src_data2_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0));
char8 src_data2_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8));
char8 src_data2_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
char8 src_data2_3 = *((__global char8 *)((__global char *)src2 + src2_index + 24));
char8 dst_data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0));
char8 dst_data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8));
char8 dst_data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16));
char8 dst_data_3 = *((__global char8 *)((__global char *)dst + dst_index + 24));
char8 data_0 = src_data1_0 OP_BINARY src_data2_0;
char8 data_1 = src_data1_1 OP_BINARY src_data2_1;
char8 data_2 = src_data1_2 OP_BINARY src_data2_2;
char8 data_3 = src_data1_3 OP_BINARY src_data2_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global char8 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8)) = data_1;
*((__global char8 *)((__global char *)dst + dst_index + 16)) = data_2;
*((__global char8 *)((__global char *)dst + dst_index + 24)) = data_3;
}
}
#endif
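// The C4_D6 case above moves 32 bytes per pixel, hence the four char8 loads
// and stores; the single mask byte still gates the whole pixel.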


@@ -43,596 +43,26 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
#ifndef OP_BINARY
#define OP_BINARY &
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary/////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/******************************bitwise binary with scalar without mask********************************/
__kernel void arithm_s_bitwise_binary_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_binary_scalar(
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int elemSize,
__global uchar *dst, int dst_step, int dst_offset,
int cols1, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
if (x < cols1 && y < rows)
{
x = x << 2;
int src1_index = mad24(y, src1_step, src1_offset + x);
int src2_index = x % elemSize;
int dst_index = mad24(y, dst_step, dst_offset + x);
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
dst[dst_index] = src1[src1_index] Operation src2[src2_index];
}
}
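/* A host-side launch sketch for the flattened kernel above, kept inside a
   comment so the .cl source stays valid. It uses the standard OpenCL C API,
   not the ocl module's actual dispatch code; it assumes cols1 is the row
   width in bytes (cols * elemSize), src2 is a small buffer holding one
   scalar element of elemSize bytes, and all cl_* objects already exist:

    size_t global[2] = { (size_t)cols1, (size_t)rows };   // one work-item per byte
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1);
    clSetKernelArg(kernel, 1, sizeof(cl_int), &src1_step);
    clSetKernelArg(kernel, 2, sizeof(cl_int), &src1_offset);
    clSetKernelArg(kernel, 3, sizeof(cl_mem), &src2);
    clSetKernelArg(kernel, 4, sizeof(cl_int), &elemSize);
    clSetKernelArg(kernel, 5, sizeof(cl_mem), &dst);
    clSetKernelArg(kernel, 6, sizeof(cl_int), &dst_step);
    clSetKernelArg(kernel, 7, sizeof(cl_int), &dst_offset);
    clSetKernelArg(kernel, 8, sizeof(cl_int), &cols1);
    clSetKernelArg(kernel, 9, sizeof(cl_int), &rows);
    clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);

   The per-byte launch trades the old hand-vectorized loads for one tiny
   kernel that covers every depth/channel combination. */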
__kernel void arithm_s_bitwise_binary_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort2 src2_data = (ushort2)(src2.x, src2.x);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data OP_BINARY src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
short2 src2_data = (short2)(src2.x, src2.x);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data OP_BINARY src2_data;
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int data = src_data1 OP_BINARY src_data2;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 data = *((__global char4 *)((__global char *)dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_binary_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 tmp_data = src1_data OP_BINARY src2_data;
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
__kernel void arithm_s_bitwise_binary_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy;
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 data = src_data1 OP_BINARY src_data2;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = (short2)(src2.x, src2.y);
short2 data = src_data1 OP_BINARY src_data2;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 data = src_data1 OP_BINARY src_data2;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 tmp_data = src1_data OP_BINARY src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_binary_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 tmp_data = src1_data OP_BINARY src2_data;
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
__kernel void arithm_s_bitwise_binary_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 data = src_data1 OP_BINARY src2;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 data = src_data1 OP_BINARY src2;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 data = src_data1 OP_BINARY src2;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 data = src_data1 OP_BINARY src2;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 data = src_data1 OP_BINARY src2;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 tmp_data = src1_data OP_BINARY src2_data;
*((__global char16 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_binary_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 tmp_data_0 = src1_data_0 OP_BINARY src2_data_0;
short4 tmp_data_1 = src1_data_1 OP_BINARY src2_data_1;
short4 tmp_data_2 = src1_data_2 OP_BINARY src2_data_2;
short4 tmp_data_3 = src1_data_3 OP_BINARY src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 )) = tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 )) = tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16)) = tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24)) = tmp_data_3;
}
}
#endif


@@ -42,6 +42,7 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
@@ -50,698 +51,29 @@
#endif
#endif
#ifndef OP_BINARY
#define OP_BINARY &
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////bitwise_binary////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_binary with scalar with mask**************************************/
__kernel void arithm_s_bitwise_binary_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);
uchar4 mask_data = vload4(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
data.z = ((mask_data.z) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
data.w = ((mask_data.w) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort2 src2_data = (ushort2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
ushort2 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
short2 src2_data = (short2)(src2.x, src2.x);
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data OP_BINARY src2_data;
data.x = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.x : data.x;
data.y = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.y : data.y;
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
int src_data1 = *((__global int *)((__global char *)src1 + src1_index));
int src_data2 = src2.x;
int dst_data = *((__global int *)((__global char *)dst + dst_index));
int data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index));
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
char4 dst_data = *((__global char4 *)((__global char *)dst + dst_index));
char4 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_binary_with_mask_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index));
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_s_bitwise_binary_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y);
uchar2 mask_data = vload2(0, mask + mask_index);
uchar4 data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 1;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset / 2) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index);
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y);
uchar2 mask_data = vload2(0, mask + mask_index);
char4 data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data OP_BINARY src2_data;
data.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data.xy : data.xy;
data.zw = ((mask_data.y) && (dst_index + 2 < dst_end )) ? tmp_data.zw : data.zw;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index));
ushort2 src_data2 = (ushort2)(src2.x, src2.y);
ushort2 dst_data = *((__global ushort2 *)((__global char *)dst + dst_index));
ushort2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index));
short2 src_data2 = (short2)(src2.x, src2.y);
short2 dst_data = *((__global short2 *)((__global char *)dst + dst_index));
short2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index));
int2 src_data2 = (int2)(src2.x, src2.y);
int2 dst_data = *((__global int2 *)((__global char *)dst + dst_index));
int2 data = src_data1 OP_BINARY src_data2;
data = mask_data ? data : dst_data;
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_binary_with_mask_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__kernel void arithm_bitwise_binary_scalar_mask(__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int elemSize,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__global uchar *dst, int dst_step, int dst_offset,
int cols, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
int mask_index = mad24(y, mask_step, (x / elemSize) + mask_offset);
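// x walks bytes, but the mask stores one byte per pixel, so the pixel's
// mask byte sits at column x / elemSize.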
if (mask[mask_index])
{
int src1_index = mad24(y, src1_step, x + src1_offset);
int src2_index = x % elemSize;
int dst_index = mad24(y, dst_step, x + dst_offset);
uchar mask_data = *(mask + mask_index);
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
short8 dst_data = *((__global short8 *)((__global char *)dst + dst_index));
short8 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global short8 *)((__global char *)dst + dst_index)) = data;
dst[dst_index] = src1[src1_index] Operation src2[src2_index];
}
}
}
#endif
__kernel void arithm_s_bitwise_binary_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
uchar4 src_data1 = *((__global uchar4 *)(src1 + src1_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global uchar4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
uchar mask_data = *(mask + mask_index);
char4 src_data1 = *((__global char4 *)(src1 + src1_index));
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global char4 *)(dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
ushort4 src_data1 = *((__global ushort4 *)((__global char *)src1 + src1_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src_data1 = *((__global short4 *)((__global char *)src1 + src1_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
int4 src_data1 = *((__global int4 *)((__global char *)src1 + src1_index));
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
int4 data = src_data1 OP_BINARY src2;
data = mask_data ? data : dst_data;
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_binary_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset);
uchar mask_data = *(mask + mask_index);
char16 src1_data = *((__global char16 *)((__global char *)src1 + src1_index));
char16 src2_data = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);
char16 dst_data = *((__global char16 *)((__global char *)dst + dst_index));
char16 data = src1_data OP_BINARY src2_data;
data = mask_data ? data : dst_data;
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_binary_with_mask_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 5) + src1_offset);
int mask_index = mad24(y, mask_step, x + mask_offset);
int dst_index = mad24(y, dst_step, (x << 5) + dst_offset);
uchar mask_data = *(mask + mask_index);
short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0));
short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8));
short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
short4 src1_data_3 = *((__global short4 *)((__global char *)src1 + src1_index + 24));
short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
short4 src2_data_3 = (short4)(src2.sc, src2.sd, src2.se, src2.sf);
short4 dst_data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0));
short4 dst_data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8));
short4 dst_data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
short4 dst_data_3 = *((__global short4 *)((__global char *)dst + dst_index + 24));
short4 data_0 = src1_data_0 OP_BINARY src2_data_0;
short4 data_1 = src1_data_1 OP_BINARY src2_data_1;
short4 data_2 = src1_data_2 OP_BINARY src2_data_2;
short4 data_3 = src1_data_3 OP_BINARY src2_data_3;
data_0 = mask_data ? data_0 : dst_data_0;
data_1 = mask_data ? data_1 : dst_data_1;
data_2 = mask_data ? data_2 : dst_data_2;
data_3 = mask_data ? data_3 : dst_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0)) = data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8)) = data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16)) = data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24)) = data_3;
}
}
#endif


@@ -0,0 +1,74 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////compare//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_compare(__global T * src1, int src1_step1, int src1_offset1,
__global T * src2, int src2_step1, int src2_offset1,
__global uchar * dst, int dst_step1, int dst_offset1,
int cols1, int rows)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols1 && y < rows)
{
int src1_index = mad24(y, src1_step1, x + src1_offset1);
int src2_index = mad24(y, src2_step1, x + src2_offset1);
int dst_index = mad24(y, dst_step1, x + dst_offset1);
dst[dst_index] = convert_uchar(src1[src1_index] Operation src2[src2_index] ? 255 : 0);
}
}
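/* T and the comparison operator are injected when the program is built; a
   hypothetical option string (names assumed for illustration, not taken
   from this diff):

    const char *opts = "-D T=float -D Operation=<";   // e.g. CMP_LT on CV_32F
    clBuildProgram(program, 1, &device, opts, NULL, NULL);

   Each work-item then writes 255 where src1 Operation src2 holds and 0
   elsewhere, the 8-bit mask convention cv::compare uses. */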

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,468 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F;
typedef double4 F4;
#define convert_F4 convert_double4
#define convert_F double
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
#define convert_F float
#endif
inline uchar round2_uchar(F v)
{
return convert_uchar_sat(round(v));
}
inline ushort round2_ushort(F v)
{
return convert_ushort_sat(round(v));
}
inline short round2_short(F v)
{
return convert_short_sat(round(v));
}
inline int round2_int(F v)
{
return convert_int_sat(round(v));
}
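/* Worked examples for the helpers above, assuming OpenCL round() semantics
   (halfway cases rounded away from zero) and saturating conversions:
     round2_uchar(3.4f)     -> 3
     round2_uchar(3.5f)     -> 4      (rounded away from zero)
     round2_uchar(300.0f)   -> 255    (clamped by convert_uchar_sat)
     round2_short(-40000.f) -> -32768 (clamped by convert_short_sat)
*/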
///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide///////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////
/**********************************div*********************************************/
__kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int2 coor = (int2)(get_global_id(0), get_global_id(1));
if (coor.x < cols && coor.y < rows)
{
coor.x = coor.x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset),
mad24(coor.y, dst_step, dst_offset + dst_step1),
mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
0);
uchar4 src1_data = vload4(0, src1 + src_index.x);
uchar4 src2_data = vload4(0, src2 + src_index.y);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_args.z));
F4 tmp = convert_F4(src1_data) * scalar;
uchar4 tmp_data;
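// Division convention: whenever the scaled numerator or the denominator is
// zero the result is forced to 0, so integer division by zero never happens.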
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_args.z)) = dst_data;
}
}
__kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
F4 tmp = convert_F4(src1_data) * scalar;
ushort4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
F4 tmp = convert_F4(src1_data) * scalar;
short4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (F)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
F tmp = (convert_F)(data1) * scalar;
int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2));
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
F tmp = (convert_F)(data1) * scalar;
float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = data1 * scalar;
double tmp_data = (tmp == 0 || data2 == 0) ? 0 : (tmp / data2);
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
/************************************div with scalar************************************/
__kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src_data = vload4(0, src + src_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
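Each work-item of the uchar kernel handles four consecutive pixels, so the store address is rounded down to a 4-byte boundary and the dst_start/dst_end range checks merge the computed lanes with the bytes already in dst at the row edges. A worked example with hypothetical numbers (not taken from the source):

// dst_offset = 2  =>  dst_align = dst_offset & 3 = 2
// work-item x = 0 =>  x <<= 2 keeps x = 0
// dst_index = (dst_offset + x) & ~3 = 0     (aligned uchar4 store address)
// dst_start = 2, so lanes 0..1 of the uchar4 fail the range check and keep
// the original dst bytes; only lanes 2..3 receive freshly computed values.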
__kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffff8);
ushort4 src_data = vload4(0, (__global ushort *)((__global char *)src + src_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffff8);
short4 src_data = vload4(0, (__global short *)((__global char *)src + src_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data = *((__global int *)((__global char *)src + src_index));
int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data));
*((__global int *)((__global char *)dst + dst_index)) = tmp_data;
}
}
__kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 2) + src_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data = *((__global float *)((__global char *)src + src_index));
float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_F)(data));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data = *((__global double *)((__global char *)src + src_index));
double tmp_data = (scalar == 0 || data == 0) ? 0 : (scalar / data);
*((__global double *)((__global char *)dst + dst_index)) = tmp_data;
}
}
#endif
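For context, a hedged host-side sketch of how a kernel from this file might be launched; none of this code is part of the commit, and queue, kernel, src and dst are assumed to have been created earlier:

#include <CL/cl.h>

/* Sketch: launch arithm_s_div_D5 on a rows x cols single-channel float
   image with no row padding; error checking omitted. */
static void launch_s_div_f32(cl_command_queue queue, cl_kernel kernel,
                             cl_mem src, cl_mem dst, int rows, int cols)
{
    int step = cols * (int)sizeof(float);   /* bytes per row */
    int offset = 0;
    float scalar = 2.0f;   /* F is assumed to be float in the build options */
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &src);
    clSetKernelArg(kernel, 1, sizeof(int), &step);
    clSetKernelArg(kernel, 2, sizeof(int), &offset);
    clSetKernelArg(kernel, 3, sizeof(cl_mem), &dst);
    clSetKernelArg(kernel, 4, sizeof(int), &step);
    clSetKernelArg(kernel, 5, sizeof(int), &offset);
    clSetKernelArg(kernel, 6, sizeof(int), &rows);
    clSetKernelArg(kernel, 7, sizeof(int), &cols);
    clSetKernelArg(kernel, 8, sizeof(int), &step);   /* dst_step1 */
    clSetKernelArg(kernel, 9, sizeof(float), &scalar);
    size_t global[2] = { (size_t)cols, (size_t)rows };
    clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
}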


@ -42,52 +42,70 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////EXP//////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst)
__kernel void arithm_exp_C1(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
if(x < cols1 && y < rows)
{
x = x << 2;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
float src_data = *((__global float *)((__global char *)src + srcIdx));
float dst_data = exp(src_data);
*((__global float *)((__global char *)dst + dstIdx)) = dst_data;
int srcIdx = mad24(y, srcStep1, x + srcOffset1);
int dstIdx = mad24(y, dstStep1, x + dstOffset1);
dst[dstIdx] = exp(src[srcIdx]);
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows)
{
x = x << 3;
int srcIdx = mad24(y, srcStep, x + srcOffset);
int dstIdx = mad24(y, dstStep, x + dstOffset);
double src_data = *((__global double *)((__global char *)src + srcIdx));
double dst_data = exp(src_data);
*((__global double *)((__global char *)dst + dstIdx)) = dst_data;
}
}
#endif
__kernel void arithm_exp_C2(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x1 = get_global_id(0) << 1;
int y = get_global_id(1);
if(x1 < cols1 && y < rows)
{
int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
dst[dstIdx] = exp(src[srcIdx]);
dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1];
}
}
__kernel void arithm_exp_C4(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x1 = get_global_id(0) << 2;
int y = get_global_id(1);
if(x1 < cols1 && y < rows)
{
int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
dst[dstIdx] = exp(src[srcIdx]);
dst[dstIdx + 1] = x1 + 1 < cols1 ? exp(src[srcIdx + 1]) : dst[dstIdx + 1];
dst[dstIdx + 2] = x1 + 2 < cols1 ? exp(src[srcIdx + 2]) : dst[dstIdx + 2];
dst[dstIdx + 3] = x1 + 3 < cols1 ? exp(src[srcIdx + 3]) : dst[dstIdx + 3];
}
}
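In the rewritten exp kernels the geometry arguments are passed in element units (cols1, srcStep1, and so on) rather than bytes, and the vectorized C2/C4 variants guard the ragged row tail with a select instead of a branch. The pattern, sketched for a generic lane k >= 1:

// dst[dstIdx + k] = x1 + k < cols1 ? exp(src[srcIdx + k]) : dst[dstIdx + k];
// Past the row end a lane re-stores the value already in dst, so every lane
// performs a uniform write while the visible data stays unchanged.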


@ -1,4 +1,3 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
@ -43,52 +42,66 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define INF_FLOAT -88.029694
#define INF_DOUBLE -709.0895657128241
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////LOG/////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global float *src, __global float *dst)
__kernel void arithm_log_C1(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < cols && y < rows )
if(x < cols1 && y < rows)
{
x = x << 2;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
int srcIdx = mad24(y, srcStep1, x + srcOffset1);
int dstIdx = mad24(y, dstStep1, x + dstOffset1);
float src_data = *((__global float *)((__global char *)src + srcIdx));
float dst_data = (src_data == 0) ? INF_FLOAT : log(fabs(src_data));
*((__global float *)((__global char *)dst + dstIdx)) = dst_data;
dst[dstIdx] = log(src[srcIdx]);
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
__kernel void arithm_log_C2(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x = get_global_id(0);
int x1 = get_global_id(0) << 1;
int y = get_global_id(1);
if(x < cols && y < rows )
if(x1 < cols1 && y < rows)
{
x = x << 3;
int srcIdx = mad24( y, srcStep, x + srcOffset);
int dstIdx = mad24( y, dstStep, x + dstOffset);
double src_data = *((__global double *)((__global char *)src + srcIdx));
double dst_data = (src_data == 0) ? INF_DOUBLE : log(fabs(src_data));
*((__global double *)((__global char *)dst + dstIdx)) = dst_data;
int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
dst[dstIdx] = log(src[srcIdx]);
dst[dstIdx + 1] = x1 + 1 < cols1 ? log(src[srcIdx + 1]) : dst[dstIdx + 1];
}
}
__kernel void arithm_log_C4(__global srcT *src, __global srcT *dst,
int cols1, int rows,
int srcOffset1, int dstOffset1,
int srcStep1, int dstStep1)
{
int x1 = get_global_id(0) << 2;
int y = get_global_id(1);
if(x1 < cols1 && y < rows)
{
int srcIdx = mad24(y, srcStep1, x1 + srcOffset1);
int dstIdx = mad24(y, dstStep1, x1 + dstOffset1);
dst[dstIdx] = log(src[srcIdx]);
dst[dstIdx + 1] = x1 + 1 < cols1 ? log(src[srcIdx + 1]) : dst[dstIdx + 1];
dst[dstIdx + 2] = x1 + 2 < cols1 ? log(src[srcIdx + 2]) : dst[dstIdx + 2];
dst[dstIdx + 3] = x1 + 3 < cols1 ? log(src[srcIdx + 3]) : dst[dstIdx + 3];
}
}
#endif
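The rewritten log kernels call log() directly, dropping the old guard (src_data == 0) ? INF_FLOAT : log(fabs(src_data)), so non-positive inputs now follow the built-in's semantics (log(0) yields -infinity and log of a negative yields NaN). A plain C sketch of the behaviour the removed guard used to provide on the float path:

#include <math.h>

/* Sketch of the removed guard: zero maps to the old INF_FLOAT sentinel and
   negative inputs go through fabsf() first. */
static float guarded_log(float v)
{
    const float INF_FLOAT = -88.029694f;   /* sentinel from the old kernel */
    return (v == 0.f) ? INF_FLOAT : logf(fabsf(v));
}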


@ -1,177 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////magnitudeSqr//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *src2, int src2_step,int src2_offset,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data;
tmp_data.x = src1_data.x * src1_data.x + src2_data.x * src2_data.x;
tmp_data.y = src1_data.y * src1_data.y + src2_data.y * src2_data.y;
tmp_data.z = src1_data.z * src1_data.z + src2_data.z * src2_data.z;
tmp_data.w = src1_data.w * src1_data.w + src2_data.w * src2_data.w;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index_fix));
if(src1_index==-6)
src1_data.s01234567 = src1_data.s67012345;
if(src1_index==-4)
src1_data.s01234567 = src1_data.s45670123;
if(src1_index== -2)
src1_data.s01234567 = src1_data.s23456701;
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data;
tmp_data.x = src1_data.s0 * src1_data.s0 + src1_data.s1 * src1_data.s1;
tmp_data.y = src1_data.s2 * src1_data.s2 + src1_data.s3 * src1_data.s3;
tmp_data.z = src1_data.s4 * src1_data.s4 + src1_data.s5 * src1_data.s5;
tmp_data.w = src1_data.s6 * src1_data.s6 + src1_data.s7 * src1_data.s7;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 8 >= dst_start) && (dst_index + 8 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 12 >= dst_start) && (dst_index + 12 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global float4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
#endif
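This deleted file computed the squared magnitude, re*re + im*im, without the square root; the C1 variant reads the real and imaginary parts from two float buffers, the C2 variant from one interleaved buffer. A scalar C reference of the per-element math, as a sketch:

/* Per-element reference for the kernels above (sketch). */
static float magnitude_sqr(float re, float im)
{
    return re * re + im * im;   /* |z|^2, no sqrt */
}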


@ -44,9 +44,14 @@
//M*/
/**************************************PUBLICFUNC*************************************/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#if defined (DEPTH_0)
#define VEC_TYPE uchar8


@ -142,29 +142,35 @@
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
/**************************************Array minMax**************************************/
__kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
__kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
__global VEC_TYPE *src, __global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
__local VEC_TYPE localmem_max[128],localmem_min[128];
VEC_TYPE minval,maxval,temp;
__local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
__local VEC_TYPE localmem_max[128], localmem_min[128];
VEC_TYPE minval, maxval, temp;
__local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128];
VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1;
int idx_c;
if(id < elemnum)
if (id < elemnum)
{
temp = src[idx];
idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3);
if (id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
if (id % cols == cols - 1)
{
repeat_e(temp);
repeat_e(temploc);
@ -181,31 +187,33 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
minloc = negative;
maxloc = negative;
}
float4 aaa;
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
int grainSize = (groupnum << 8);
for (id = id + grainSize; id < elemnum; id = id + grainSize)
{
idx = offset + id + (id / cols) * invalid_cols;
temp = src[idx];
idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3);
if (id % cols == 0 )
{
repeat_s(temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
if (id % cols == cols - 1)
{
repeat_e(temp);
repeat_e(temploc);
}
minval = min(minval,temp);
maxval = max(maxval,temp);
minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
aaa= convert_float4(maxval == temp);
maxloc = convert_int4(aaa) ? temploc : maxloc;
minval = min(minval, temp);
maxval = max(maxval, temp);
minloc = CONDITION_FUNC(minval == temp, temploc, minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc);
}
if(lid > 127)
if (lid > 127)
{
localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval;
@ -213,29 +221,30 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
localmem_maxloc[lid - 128] = maxloc;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
if (lid < 128)
{
localmem_min[lid] = min(minval,localmem_min[lid]);
localmem_max[lid] = max(maxval,localmem_max[lid]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
for (int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
if (lid < lsize)
{
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
if (lid == 0)
{
dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
@ -243,138 +252,3 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
}
}
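Each 256-thread work-group folds its running values into 128 local-memory slots and then runs a halving tree reduction (lsize = 64, 32, ..., 1), finally writing four partial results (min, max, minloc, maxloc) at gid, gid + groupnum, gid + 2 * groupnum and gid + 3 * groupnum. A second pass still has to fold the groupnum partials; a hedged host-side sketch for the minimum:

#include <math.h>

/* Sketch (not from the commit): fold the per-group partial minima the
   kernel left at partial[0..groupnum-1]. */
static float fold_min(const float *partial, int groupnum)
{
    float m = partial[0];
    for (int g = 1; g < groupnum; ++g)
        m = fminf(m, partial[g]);
    return m;
}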
#if defined (REPEAT_S0)
#define repeat_ms(a) a = a;
#endif
#if defined (REPEAT_S1)
#define repeat_ms(a) a.s0 = 0;
#endif
#if defined (REPEAT_S2)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;
#endif
#if defined (REPEAT_S3)
#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E0)
#define repeat_me(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_me(a) a.s3 = 0;
#endif
#if defined (REPEAT_E2)
#define repeat_me(a) a.s3 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E3)
#define repeat_me(a) a.s3 = 0;a.s2 = 0;a.s1 = 0;
#endif
/**************************************Array minMaxLoc mask**************************************/
/*
__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global VEC_TYPE *src,
int minvalid_cols,int moffset,__global uchar4 *mask,__global RES_TYPE *dst)
{
unsigned int lid = get_local_id(0);
unsigned int gid = get_group_id(0);
unsigned int id = get_global_id(0);
unsigned int idx = offset + id + (id / cols) * invalid_cols;
unsigned int midx = moffset + id + (id / cols) * minvalid_cols;
__local VEC_TYPE localmem_max[128],localmem_min[128];
VEC_TYPE minval,maxval,temp,max_val = MAX_VAL,min_val = MIN_VAL,zero = 0,m_temp;
__local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
if(id < elemnum)
{
temp = src[idx];
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
{
repeat_me(m_temp);
repeat_e(temploc);
}
minval = m_temp > zero ? temp : max_val;
maxval = m_temp > zero ? temp : min_val;
minloc = CONDITION_FUNC(m_temp > zero, temploc , negative);
maxloc = minloc;
}
else
{
minval = MAX_VAL;
maxval = MIN_VAL;
minloc = negative;
maxloc = negative;
}
for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
{
idx = offset + id + (id / cols) * invalid_cols;
midx = moffset + id + (id / cols) * minvalid_cols;
temp = src[idx];
m_temp = CONVERT_TYPE(mask[midx]);
int idx_c = idx << 2;
temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
if(id % cols == 0 )
{
repeat_ms(m_temp);
repeat_s(temploc);
}
if(id % cols == cols - 1)
{
repeat_me(m_temp);
repeat_e(temploc);
}
minval = min(minval,m_temp > zero ? temp : max_val);
maxval = max(maxval,m_temp > zero ? temp : min_val);
temploc = CONDITION_FUNC(m_temp > zero, temploc , negative);
minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
}
if(lid > 127)
{
localmem_min[lid - 128] = minval;
localmem_max[lid - 128] = maxval;
localmem_minloc[lid - 128] = minloc;
localmem_maxloc[lid - 128] = maxloc;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(lid < 128)
{
localmem_min[lid] = min(minval,localmem_min[lid]);
localmem_max[lid] = max(maxval,localmem_max[lid]);
localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int lsize = 64; lsize > 0; lsize >>= 1)
{
if(lid < lsize)
{
int lid2 = lsize + lid;
localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
localmem_minloc[lid] =
CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
localmem_maxloc[lid] =
CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if( lid == 0)
{
dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
}
}
*/


@ -1,303 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
int4 round_int4(float4 v)
{
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_int4_sat(v);
}
uint4 round_uint4(float4 v)
{
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
v.s3 = v.s3 + (v.s3 > 0 ? 0.5 : -0.5);
return convert_uint4_sat(v);
}
int round_int(float v)
{
v = v + (v > 0 ? 0.5 : -0.5);
return convert_int_sat(v);
}
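These helpers round half away from zero and then saturate via convert_*_sat, which clamps out-of-range values instead of wrapping. Worked examples:

// round_int(2.5f)  ->  2.5 + 0.5 =  3.0  ->  3
// round_int(-2.5f) -> -2.5 - 0.5 = -3.0  -> -3   (away from zero, unlike rint)
// round_int(3e9f)  -> exceeds INT_MAX, so convert_int_sat clamps to 2147483647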
//////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////multiply//////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************add without mask**************************************/
__kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + x) & (int)0xfffffffc);
uchar4 src1_data, src2_data;
src1_data.x = src1_index + 0 >= 0 ? src1[src1_index + 0] : 0;
src1_data.y = src1_index + 1 >= 0 ? src1[src1_index + 1] : 0;
src1_data.z = src1_index + 2 >= 0 ? src1[src1_index + 2] : 0;
src1_data.w = src1_index + 3 >= 0 ? src1[src1_index + 3] : 0;
src2_data.x = src2_index + 0 >= 0 ? src2[src2_index + 0] : 0;
src2_data.y = src2_index + 1 >= 0 ? src2[src2_index + 1] : 0;
src2_data.z = src2_index + 2 >= 0 ? src2[src2_index + 2] : 0;
src2_data.w = src2_index + 3 >= 0 ? src2[src2_index + 3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
tmp = round_int4(convert_float4(tmp) * scalar);
uchar4 tmp_data = convert_uchar4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
}
}
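The uchar multiply widens both operands to int before multiplying, scales in float, rounds, and saturates back to uchar. A scalar C reference of one lane, as a sketch:

/* One-lane reference for arithm_mul_D0 (sketch, not from the commit). */
static unsigned char mul_scale_u8(unsigned char a, unsigned char b, float scalar)
{
    int   wide = (int)a * (int)b;       /* widen so the product cannot overflow */
    float v    = (float)wide * scalar;  /* apply the scale factor */
    v += (v > 0 ? 0.5f : -0.5f);        /* round half away from zero */
    int r = (int)v;
    return (unsigned char)(r < 0 ? 0 : (r > 255 ? 255 : r));   /* saturate */
}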
__kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
uint4 tmp = convert_uint4_sat(src1_data) * convert_uint4_sat(src2_data);
tmp = round_uint4(convert_float4(tmp) * scalar);
ushort4 tmp_data = convert_ushort4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, (dst_offset + (x << 1)) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
int4 tmp = convert_int4_sat(src1_data) * convert_int4_sat(src2_data);
tmp = round_int4(convert_float4(tmp) * scalar);
short4 tmp_data = convert_short4_sat(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
}
}
__kernel void arithm_mul_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index));
int tmp = data1 * data2;
tmp = round_int((float)tmp * scalar);
*((__global int *)((__global char *)dst + dst_index)) = convert_int_sat(tmp);
}
}
__kernel void arithm_mul_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index));
float tmp = data1 * data2;
tmp = tmp * scalar;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset);
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
double data1 = *((__global double *)((__global char *)src1 + src1_index));
double data2 = *((__global double *)((__global char *)src2 + src2_index));
double tmp = data1 * data2;
tmp = tmp * scalar;
*((__global double *)((__global char *)dst + dst_index)) = tmp;
}
}
#endif
#ifdef DOUBLE_SUPPORT
#define SCALAR_TYPE double
#else
#define SCALAR_TYPE float
#endif
__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, SCALAR_TYPE scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
if (x < cols && y < rows)
{
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset);
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset);
float data1 = *((__global float *)((__global char *)src1 + src1_index));
float tmp = data1 * scalar;
*((__global float *)((__global char *)dst + dst_index)) = tmp;
}
}
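Because SCALAR_TYPE is double when DOUBLE_SUPPORT is defined and float otherwise, the host has to pass a matching argument size for parameter 9 of arithm_muls_D5. A hedged host-side sketch, where has_fp64 is an assumed flag derived from the device extensions:

#include <CL/cl.h>

static void set_muls_scalar(cl_kernel kernel, int has_fp64, double value)
{
    if (has_fp64) {
        double s = value;                               /* SCALAR_TYPE == double */
        clSetKernelArg(kernel, 9, sizeof(double), &s);
    } else {
        float s = (float)value;                         /* SCALAR_TYPE == float */
        clSetKernelArg(kernel, 9, sizeof(float), &s);
    }
}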


@ -43,18 +43,23 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#define TILE_DIM 32
#define BLOCK_ROWS 8
#define LDS_STEP (TILE_DIM + 1)
#define LDS_STEP TILE_DIM
// 8UC1 is not optimized, as the size of the write per thread is 8 bytes,
// which will use CompletePath (rather than FastPath) on AMD hardware
__kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
__kernel void transpose(__global const T* src, __global T* dst,
int src_cols, int src_rows,
int src_step, int dst_step,
int src_offset, int dst_offset)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
@ -81,430 +86,54 @@ __kernel void transpose_C1_D0(__global uchar* src, int src_step, int src_offset,
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local uchar title[TILE_DIM * LDS_STEP];
__local T title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
if (x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, x);
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
if (y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] =*(src + src_offset + index_src);
title[(ly + i) * LDS_STEP + lx] = src[src_offset + index_src];
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
if (x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, x_index);
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
if ((y_index + i) < src_cols)
{
*(dst + dst_offset + index_dst ) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
dst[dst_offset + index_dst] = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS;
}
}
}
}
__kernel void transpose_C1_D4(__global int* src, int src_step, int src_offset,
__global int* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
__kernel void transpose_inplace(__global T* src, __global T* dst,
int src_cols, int src_rows,
int src_step, int dst_step,
int src_offset, int dst_offset)
{
int x = get_global_id(0);
int y = get_global_id(1);
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
if (y < src_rows && x < y)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int srcIdx = mad24(y, src_step, src_offset + x);
int dstIdx = mad24(x, dst_step, dst_offset + y);
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local int title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global int *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global int*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C1_D5(__global float* src, int src_step, int src_offset,
__global float* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local float title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global float *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global float*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D2(__global ushort* src, int src_step, int src_offset,
__global ushort* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local ushort2 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global ushort2 *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global ushort2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C2_D3(__global short* src, int src_step, int src_offset,
__global short* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local short2 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global short2 *)((__global char*)src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global short2*)((__global char*)dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D0(__global uchar* src, int src_step, int src_offset,
__global uchar* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local uchar4 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global uchar4 *)(src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global uchar4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
}
}
__kernel void transpose_C4_D1(__global char* src, int src_step, int src_offset,
__global char* dst, int dst_step, int dst_offset,
int src_rows, int src_cols)
{
int gp_x = get_group_id(0), gp_y = get_group_id(1);
int gs_x = get_num_groups(0), gs_y = get_num_groups(1);
int groupId_x, groupId_y;
if(src_rows == src_cols)
{
groupId_y = gp_x;
groupId_x = (gp_x + gp_y) % gs_x;
}
else
{
int bid = gp_x + gs_x * gp_y;
groupId_y = bid % gs_y;
groupId_x = ((bid / gs_y) + groupId_y) % gs_x;
}
int lx = get_local_id(0);
int ly = get_local_id(1);
int x = groupId_x * TILE_DIM + lx;
int y = groupId_y * TILE_DIM + ly;
int x_index = groupId_y * TILE_DIM + lx;
int y_index = groupId_x * TILE_DIM + ly;
__local char4 title[TILE_DIM * LDS_STEP];
if(x < src_cols && y < src_rows)
{
int index_src = mad24(y, src_step, (x << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if(y + i < src_rows)
{
title[(ly + i) * LDS_STEP + lx] = *((__global char4 *)(src + src_offset + index_src));
index_src = mad24(BLOCK_ROWS, src_step, index_src);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(x_index < src_rows && y_index < src_cols)
{
int index_dst = mad24(y_index, dst_step, (x_index << 2));
#pragma unroll
for(int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
if((y_index + i) < src_cols)
{
*((__global char4*)(dst + dst_offset + index_dst )) = title[lx * LDS_STEP + ly + i];
index_dst += dst_step * BLOCK_ROWS ;
}
}
T tmp = dst[dstIdx];
dst[dstIdx] = src[srcIdx];
src[srcIdx] = tmp;
}
}
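The tiled kernel stages a TILE_DIM x TILE_DIM block in local memory and writes it back transposed, while the new transpose_inplace kernel simply swaps elements across the main diagonal (only work-items with x < y perform a swap). A scalar C reference of the in-place path, as a sketch:

/* In-place reference for a tightly packed n x n matrix (sketch). */
static void transpose_inplace_ref(float *a, int n)
{
    for (int y = 0; y < n; ++y)
        for (int x = 0; x < y; ++x) {
            float t = a[y * n + x];
            a[y * n + x] = a[x * n + y];   /* swap (x, y) with (y, x) */
            a[x * n + y] = t;
        }
}

Note that the old LDS_STEP of (TILE_DIM + 1) padded each local tile row by one element, the classic trick for keeping column accesses out of a single local-memory bank; the new definition drops that padding.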

File diff suppressed because it is too large


@ -48,7 +48,7 @@
#define MHEIGHT 256
#define MIN_VALUE 171
#define MAX_VALUE 351
#define MAX_VALUE 357
//#define RANDOMROI
int randomInt(int minVal, int maxVal);