diff --git a/modules/core/doc/operations_on_arrays.rst b/modules/core/doc/operations_on_arrays.rst
index c2121fc6f..3f85eab30 100644
--- a/modules/core/doc/operations_on_arrays.rst
+++ b/modules/core/doc/operations_on_arrays.rst
@@ -2065,7 +2065,7 @@ normalize
 ---------
 Normalizes the norm or value range of an array.
 
-.. ocv:function:: void normalize( InputArray src, OutputArray dst, double alpha=1, double beta=0, int norm_type=NORM_L2, int dtype=-1, InputArray mask=noArray() )
+.. ocv:function:: void normalize( InputArray src, InputOutputArray dst, double alpha=1, double beta=0, int norm_type=NORM_L2, int dtype=-1, InputArray mask=noArray() )
 
 .. ocv:function:: void normalize(const SparseMat& src, SparseMat& dst, double alpha, int normType)
 
diff --git a/modules/core/include/opencv2/core.hpp b/modules/core/include/opencv2/core.hpp
index 9c409482c..ed15594ae 100644
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -240,7 +240,7 @@ CV_EXPORTS_W void batchDistance(InputArray src1, InputArray src2,
                                 bool crosscheck = false);
 
 //! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
-CV_EXPORTS_W void normalize( InputArray src, OutputArray dst, double alpha = 1, double beta = 0,
+CV_EXPORTS_W void normalize( InputArray src, InputOutputArray dst, double alpha = 1, double beta = 0,
                              int norm_type = NORM_L2, int dtype = -1, InputArray mask = noArray());
 
 //! scales and shifts array elements so that either the specified norm (alpha) or the minimum (alpha) and maximum (beta) array values get the specified values
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index d921f7565..d399265b0 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -131,6 +131,7 @@ public:
     virtual bool isSubmatrix(int i=-1) const;
     virtual bool empty() const;
     virtual void copyTo(const _OutputArray& arr) const;
+    virtual void copyTo(const _OutputArray& arr, const _InputArray & mask) const;
     virtual size_t offset(int i=-1) const;
     virtual size_t step(int i=-1) const;
     bool isMat() const;
diff --git a/modules/core/src/convert.cpp b/modules/core/src/convert.cpp
index 01a792cb9..4848489a0 100644
--- a/modules/core/src/convert.cpp
+++ b/modules/core/src/convert.cpp
@@ -1842,18 +1842,86 @@ namespace cv {
 
 #ifdef HAVE_OPENCL
 
-static bool ocl_normalize( InputArray _src, OutputArray _dst, InputArray _mask, int rtype,
-                           double scale, double shift )
+static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
+                           double scale, double delta )
 {
-    UMat src = _src.getUMat(), dst = _dst.getUMat();
+    UMat src = _src.getUMat();
 
     if( _mask.empty() )
-        src.convertTo( dst, rtype, scale, shift );
+        src.convertTo( _dst, dtype, scale, delta );
+    else if (src.channels() <= 4)
+    {
+        const ocl::Device & dev = ocl::Device::getDefault();
+
+        int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
+                ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
+                rowsPerWI = dev.isIntel() ? 4 : 1;
+
+        float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
+        bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
+                haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
+                haveDelta = std::fabs(delta) > DBL_EPSILON,
+                doubleSupport = dev.doubleFPConfig() > 0;
+
+        if (!haveScale && !haveDelta && stype == dtype)
+        {
+            _src.copyTo(_dst, _mask);
+            return true;
+        }
+        if (haveZeroScale)
+        {
+            _dst.setTo(Scalar(delta), _mask);
+            return true;
+        }
+
+        if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
+            return false;
+
+        char cvt[2][40];
+        String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
+                             " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
+                             ocl::typeToStr(stype), ocl::typeToStr(dtype),
+                             ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
+                             rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
+                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
+                             doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                             haveScale ? " -D HAVE_SCALE" : "",
+                             haveDelta ? " -D HAVE_DELTA" : "",
+                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
+
+        ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
+        if (k.empty())
+            return false;
+
+        UMat mask = _mask.getUMat(), dst = _dst.getUMat();
+
+        ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+                maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
+                dstarg = ocl::KernelArg::ReadWrite(dst);
+
+        if (haveScale)
+        {
+            if (haveDelta)
+                k.args(srcarg, maskarg, dstarg, fscale, fdelta);
+            else
+                k.args(srcarg, maskarg, dstarg, fscale);
+        }
+        else
+        {
+            if (haveDelta)
+                k.args(srcarg, maskarg, dstarg, fdelta);
+            else
+                k.args(srcarg, maskarg, dstarg);
+        }
+
+        size_t globalsize[2] = { src.cols, (src.rows + rowsPerWI - 1) / rowsPerWI };
+        return k.run(2, globalsize, NULL, false);
+    }
     else
     {
         UMat temp;
-        src.convertTo( temp, rtype, scale, shift );
-        temp.copyTo( dst, _mask );
+        src.convertTo( temp, dtype, scale, delta );
+        temp.copyTo( _dst, _mask );
     }
 
     return true;
@@ -1863,7 +1931,7 @@ static bool ocl_normalize( InputArray _src, OutputArray _dst, InputArray _mask,
 
 }
 
-void cv::normalize( InputArray _src, OutputArray _dst, double a, double b,
+void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
                     int norm_type, int rtype, InputArray _mask )
 {
     double scale = 1, shift = 0;
diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp
index 0bb62c7df..653efe63a 100644
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2051,6 +2051,23 @@ void _InputArray::copyTo(const _OutputArray& arr) const
         CV_Error(Error::StsNotImplemented, "");
 }
 
+void _InputArray::copyTo(const _OutputArray& arr, const _InputArray & mask) const
+{
+    int k = kind();
+
+    if( k == NONE )
+        arr.release();
+    else if( k == MAT || k == MATX || k == STD_VECTOR )
+    {
+        Mat m = getMat();
+        m.copyTo(arr, mask);
+    }
+    else if( k == UMAT )
+        ((UMat*)obj)->copyTo(arr, mask);
+    else
+        CV_Error(Error::StsNotImplemented, "");
+}
+
 bool _OutputArray::fixedSize() const
 {
     return (flags & FIXED_SIZE) == FIXED_SIZE;
diff --git a/modules/core/src/opencl/normalize.cl b/modules/core/src/opencl/normalize.cl
new file mode 100644
index 000000000..6582e5555
--- /dev/null
+++ b/modules/core/src/opencl/normalize.cl
@@ -0,0 +1,72 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+// Copyright (C) 2014, Itseez, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#ifdef DOUBLE_SUPPORT
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+#define noconvert
+
+#if cn != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define srcTSIZE (int)sizeof(srcT)
+#define dstTSIZE (int)sizeof(dstT)
+#else
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define srcTSIZE ((int)sizeof(srcT1)*3)
+#define dstTSIZE ((int)sizeof(dstT1)*3)
+#endif
+
+__kernel void normalizek(__global const uchar * srcptr, int src_step, int src_offset,
+                         __global const uchar * mask, int mask_step, int mask_offset,
+                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
+#ifdef HAVE_SCALE
+                         , float scale
+#endif
+#ifdef HAVE_DELTA
+                         , float delta
+#endif
+                         )
+{
+    int x = get_global_id(0);
+    int y0 = get_global_id(1) * rowsPerWI;
+
+    if (x < dst_cols)
+    {
+        int src_index  = mad24(y0, src_step, mad24(x, srcTSIZE, src_offset));
+        int mask_index = mad24(y0, mask_step, x + mask_offset);
+        int dst_index  = mad24(y0, dst_step, mad24(x, dstTSIZE, dst_offset));
+
+        for (int y = y0, y1 = min(y0 + rowsPerWI, dst_rows); y < y1;
+            ++y, src_index += src_step, dst_index += dst_step, mask_index += mask_step)
+        {
+            if (mask[mask_index])
+            {
+                workT value = convertToWT(loadpix(srcptr + src_index));
+#ifdef HAVE_SCALE
+#ifdef HAVE_DELTA
+                value = fma(value, (workT)(scale), (workT)(delta));
+#else
+                value *= (workT)(scale);
+#endif
+#else // not scale
+#ifdef HAVE_DELTA
+                value += (workT)(delta);
+#endif
+#endif
+
+                storepix(convertToDT(value), dstptr + dst_index);
+            }
+        }
+    }
+}