diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index f2acaa3fb..73beb911f 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -442,6 +442,10 @@ template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(
 template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); } // generic fallback: plain _Tp(v) conversion, no clamping
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); } // generic fallback: plain _Tp(v) conversion, no clamping
 
 //! @cond IGNORED
 
@@ -452,6 +456,8 @@ template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_c
 template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
 template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
 template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(int64 v)        { return v < 0 ? (uchar)0 : v > (int64)UCHAR_MAX ? (uchar)UCHAR_MAX : (uchar)v; } // clamp to [0, UCHAR_MAX]
+template<> inline uchar saturate_cast<uchar>(uint64 v)       { return v > (uint64)UCHAR_MAX ? (uchar)UCHAR_MAX : (uchar)v; } // clamp to [0, UCHAR_MAX]
 
 template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
 template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
@@ -460,6 +466,8 @@ template<> inline schar saturate_cast<schar>(short v)        { return saturate_c
 template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
 template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
 template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)v - (uint64)SCHAR_MIN <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); } // offset applied in unsigned arithmetic: no signed overflow for v near INT64_MAX
+template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
 
 template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
 template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
@@ -467,12 +475,16 @@ template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((
 template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
 template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
 template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(int64 v)      { return v < 0 ? (ushort)0 : v > (int64)USHRT_MAX ? (ushort)USHRT_MAX : (ushort)v; } // clamp to [0, USHRT_MAX]
+template<> inline ushort saturate_cast<ushort>(uint64 v)     { return v > (uint64)USHRT_MAX ? (ushort)USHRT_MAX : (ushort)v; } // clamp to [0, USHRT_MAX]
 
 template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
 template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
 template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
 template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
 template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)v - (uint64)SHRT_MIN <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); } // offset applied in unsigned arithmetic: no signed overflow for v near INT64_MAX
+template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
 
 template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
 template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index 2d1087e89..c651b9ee3 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -119,7 +119,7 @@ CV_EXPORTS_W void inpaint( InputArray src, InputArray inpaintMask,
 <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
 optimizations. Noise expected to be a gaussian white noise
 
-@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
+@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
 @param dst Output image with the same size and type as src .
 @param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@@ -138,6 +138,35 @@ parameter.
 CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h = 3,
         int templateWindowSize = 7, int searchWindowSize = 21);
 
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
+optimizations. Noise is expected to be Gaussian white noise.
+
+@param src Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. Advanced usage of this function can be manual denoising of a colored
+image in different colorspaces. Such an approach is used in fastNlMeansDenoisingColored by converting
+the image to CIELAB colorspace and then separately denoising L and AB components with different h
+parameter.
+ */
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
+                                        const std::vector<float>& h,
+                                        int templateWindowSize = 7, int searchWindowSize = 21,
+                                        int normType = NORM_L2);
+
 /** @brief Modification of fastNlMeansDenoising function for colored images
 
 @param src Input 8-bit 3-channel image.
@@ -165,7 +194,35 @@ captured in small period of time. For example video. This version of the functio
 images or for manual manipulation with colorspaces. For more details see
 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
 
-@param srcImgs Input 8-bit 1-channel, 2-channel or 3-channel images sequence. All images should
+@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
+4-channel images sequence. All images should have the same type and
+size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength. Bigger h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been
+captured in small period of time. For example video. This version of the function is for grayscale
+images or for manual manipulation with colorspaces. For more details see
+<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
+
+@param srcImgs Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel images sequence. All images should
 have the same type and size.
 @param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
 @param temporalWindowSize Number of surrounding images to use for target image denoising. Should
@@ -178,13 +235,17 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
-removes noise but also removes image details, smaller h value preserves details but also preserves
-some noise
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in dst. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
-        int imgToDenoiseIndex, int temporalWindowSize,
-        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+                                             int imgToDenoiseIndex, int temporalWindowSize,
+                                             const std::vector<float>& h,
+                                             int templateWindowSize = 7, int searchWindowSize = 21,
+                                             int normType = NORM_L2);
 
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
 
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index b4767a738..c68d09b92 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -45,42 +45,115 @@
 #include "fast_nlmeans_multi_denoising_invoker.hpp"
 #include "fast_nlmeans_denoising_opencl.hpp"
 
+// Dispatches single-image non-local means denoising on the channel count of src.
+// ST is the sample type; IT, UIT and D (the distance policy, DistSquared or DistAbs
+// at the call sites) are forwarded unchanged to FastNlMeansDenoisingInvoker.
+// h holds the filter strengths: one shared by all channels, or one per channel.
+template<typename ST, typename IT, typename UIT, typename D>
+static void fastNlMeansDenoising_( const Mat& src, Mat& dst, const std::vector<float>& h,
+                                   int templateWindowSize, int searchWindowSize)
+{
+    // One shared strength selects the scalar weight type (int); otherwise the
+    // weight type is a vector with one component per channel (Vec2i/Vec3i/Vec4i).
+    const bool sharedStrength = ((int)h.size() == 1);
+    const cv::Range rows(0, src.rows);
+
+    switch (src.channels()) {
+        case 1:
+            parallel_for_(rows, FastNlMeansDenoisingInvoker<ST, IT, UIT, D, int>(
+                src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 2:
+            if (sharedStrength)
+                parallel_for_(rows, FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(rows, FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
+                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 3:
+            if (sharedStrength)
+                parallel_for_(rows, FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(rows, FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
+                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 4:
+            if (sharedStrength)
+                parallel_for_(rows, FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(rows, FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
+                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                     "Unsupported number of channels! Only 1, 2, 3, and 4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
                                int templateWindowSize, int searchWindowSize)
 {
+    const std::vector<float> strengths(1, h); // the same strength for every channel
+    fastNlMeansDenoising(_src, _dst, strengths, templateWindowSize, searchWindowSize);
+}
+
+void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, const std::vector<float>& h,
+                               int templateWindowSize, int searchWindowSize, int normType)
+{ // Vector-of-strengths overload; the float overload forwards here.
+    int hn = (int)h.size(), type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert(hn == 1 || hn == cn);
+    // h holds one strength shared by all channels, or exactly one per channel (asserted above).
     Size src_size = _src.size();
     CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize))
+               ocl_fastNlMeansDenoising(_src, _dst, &h[0], hn,
+                                        templateWindowSize, searchWindowSize, normType))
 
     Mat src = _src.getMat();
     _dst.create(src_size, src.type());
     Mat dst = _dst.getMat();
+    // Pick the distance functor from normType, then the sample/accumulator types from depth.
+    switch (normType) {
+        case NORM_L2:
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if(tegra::useTegra() && tegra::fastNlMeansDenoising(src, dst, h, templateWindowSize, searchWindowSize))
-        return;
+            if(hn == 1 && tegra::useTegra() && // Tegra path is attempted only for a single shared strength
+               tegra::fastNlMeansDenoising(src, dst, h[0], templateWindowSize, searchWindowSize))
+                return;
 #endif
-
-    switch (src.type()) {
-        case CV_8U:
-            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoising_<uchar, int, unsigned, DistSquared>(src, dst, h,
+                                                                             templateWindowSize,
+                                                                             searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U is supported for NORM_L2");
+            }
             break;
-        case CV_8UC2:
-            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
-            break;
-        case CV_8UC3:
-            parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+        case NORM_L1: // only the L1 norm accepts 16-bit input in this function
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoising_<uchar, int, unsigned, DistAbs>(src, dst, h,
+                                                                         templateWindowSize,
+                                                                         searchWindowSize);
+                    break;
+                case CV_16U:
+                    fastNlMeansDenoising_<ushort, int64, uint64, DistAbs>(src, dst, h,
+                                                                          templateWindowSize,
+                                                                          searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U and CV_16U are supported for NORM_L1");
+            }
             break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8UC1, CV_8UC2 and CV_8UC3 are supported");
+                     "Unsupported norm type! Only NORM_L2 and NORM_L1 are supported");
     }
 }
 
@@ -92,7 +165,7 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
     Size src_size = _src.size();
     if (type != CV_8UC3 && type != CV_8UC4)
     {
-        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3!");
+        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3 or CV_8UC4!");
         return;
     }
 
@@ -108,8 +181,8 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
     Mat src_lab;
     cvtColor(src, src_lab, COLOR_LBGR2Lab);
 
-    Mat l(src_size, CV_8U);
-    Mat ab(src_size, CV_8UC2);
+    Mat l(src_size, CV_MAKE_TYPE(depth, 1));
+    Mat ab(src_size, CV_MAKE_TYPE(depth, 2));
     Mat l_ab[] = { l, ab };
     int from_to[] = { 0,0, 1,1, 2,2 };
     mixChannels(&src_lab, 1, l_ab, 2, from_to, 3);
@@ -157,9 +230,76 @@ static void fastNlMeansDenoisingMultiCheckPreconditions(
         }
 }
 
+// Dispatches multi-frame non-local means denoising on the channel count of the frames.
+template<typename ST, typename IT, typename UIT, typename D>
+static void fastNlMeansDenoisingMulti_( const std::vector<Mat>& srcImgs, Mat& dst,
+                                        int imgToDenoiseIndex, int temporalWindowSize,
+                                        const std::vector<float>& h,
+                                        int templateWindowSize, int searchWindowSize)
+{
+    int hn = (int)h.size(); // 1: strength shared by all channels; otherwise one per channel
+    // Match on channel count only: the depth is fixed by ST, so 8-bit type codes would reject 16-bit input.
+    switch (CV_MAT_CN(srcImgs[0].type())) {
+        case 1:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                          FastNlMeansMultiDenoisingInvoker<ST, IT, UIT, D, int>(
+                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                              dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 2:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 3:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 4:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                "Unsupported number of channels! Only 1, 2, 3, and 4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                     int imgToDenoiseIndex, int temporalWindowSize,
                                     float h, int templateWindowSize, int searchWindowSize)
+{
+    fastNlMeansDenoisingMulti(_srcImgs, _dst, imgToDenoiseIndex, temporalWindowSize, // forwards to the vector-of-strengths overload
+                              std::vector<float>(1, h), templateWindowSize, searchWindowSize); // h wrapped as a one-element vector; normType keeps its declared default
+}
+
+void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
+                                    int imgToDenoiseIndex, int temporalWindowSize,
+                                    const std::vector<float>& h,
+                                    int templateWindowSize, int searchWindowSize, int normType)
 {
     std::vector<Mat> srcImgs;
     _srcImgs.getMatVector(srcImgs);
@@ -168,32 +308,52 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
         srcImgs, imgToDenoiseIndex,
         temporalWindowSize, templateWindowSize, searchWindowSize);
 
+    int hn = (int)h.size();
+    int type = srcImgs[0].type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert(hn == 1 || hn == cn);
+
     _dst.create(srcImgs[0].size(), srcImgs[0].type());
     Mat dst = _dst.getMat();
 
-    switch (srcImgs[0].type())
-    {
-        case CV_8U:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+    switch (normType) {
+        case NORM_L2:
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoisingMulti_<uchar, int, unsigned,
+                                               DistSquared>(srcImgs, dst,
+                                                            imgToDenoiseIndex, temporalWindowSize,
+                                                            h,
+                                                            templateWindowSize, searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U is supported for NORM_L2");
+            }
             break;
-        case CV_8UC2:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
-            break;
-        case CV_8UC3:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+        case NORM_L1:
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoisingMulti_<uchar, int, unsigned,
+                                               DistAbs>(srcImgs, dst,
+                                                        imgToDenoiseIndex, temporalWindowSize,
+                                                        h,
+                                                        templateWindowSize, searchWindowSize);
+                    break;
+                case CV_16U:
+                    fastNlMeansDenoisingMulti_<ushort, int64, uint64,
+                                               DistAbs>(srcImgs, dst,
+                                                        imgToDenoiseIndex, temporalWindowSize,
+                                                        h,
+                                                        templateWindowSize, searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U and CV_16U are supported for NORM_L1");
+            }
             break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported matrix format! Only uchar, Vec2b, Vec3b are supported");
+                     "Unsupported norm type! Only NORM_L2 and NORM_L1 are supported");
     }
 }
 
@@ -212,9 +372,10 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
     _dst.create(srcImgs[0].size(), srcImgs[0].type());
     Mat dst = _dst.getMat();
 
+    int type = srcImgs[0].type(), depth = CV_MAT_DEPTH(type);
     int src_imgs_size = static_cast<int>(srcImgs.size());
 
-    if (srcImgs[0].type() != CV_8UC3)
+    if (type != CV_8UC3)
     {
         CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3!");
         return;
@@ -228,9 +389,9 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
     std::vector<Mat> ab(src_imgs_size);
     for (int i = 0; i < src_imgs_size; i++)
     {
-        src_lab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC3);
-        l[i] = Mat::zeros(srcImgs[0].size(), CV_8UC1);
-        ab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC2);
+        src_lab[i] = Mat::zeros(srcImgs[0].size(), type);
+        l[i] = Mat::zeros(srcImgs[0].size(), CV_MAKE_TYPE(depth, 1));
+        ab[i] = Mat::zeros(srcImgs[0].size(), CV_MAKE_TYPE(depth, 2));
         cvtColor(srcImgs[i], src_lab[i], COLOR_LBGR2Lab);
 
         Mat l_ab[] = { l[i], ab[i] };
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index b8f5a0392..6e74acf03 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -50,13 +50,13 @@
 
 using namespace cv;
 
-template <typename T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansDenoisingInvoker :
         public ParallelLoopBody
 {
 public:
     FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
-        int template_window_size, int search_window_size, const float h);
+        int template_window_size, int search_window_size, const float *h);
 
     void operator() (const Range& range) const;
 
@@ -75,9 +75,9 @@ private:
     int template_window_half_size_;
     int search_window_half_size_;
 
-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift_;
-    std::vector<int> almost_dist2weight_;
+    std::vector<WT> almost_dist2weight_;
 
     void calcDistSumsForFirstElementInRow(
         int i, Array2d<int>& dist_sums,
@@ -99,15 +99,15 @@ inline int getNearestPowerOf2(int value)
     return p;
 }
 
-template <class T>
-FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansDenoisingInvoker(
     const Mat& src, Mat& dst,
     int template_window_size,
     int search_window_size,
-    const float h) :
+    const float *h) :
     src_(src), dst_(dst)
 {
-    CV_Assert(src.channels() == sizeof(T)); //T is Vec1b or Vec2b or Vec3b
+    CV_Assert(src.channels() == pixelInfo<T>::channels);
 
     template_window_half_size_ = template_window_size / 2;
     search_window_half_size_   = search_window_size   / 2;
@@ -117,8 +117,10 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
     border_size_ = search_window_half_size_ + template_window_half_size_;
     copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT);
 
-    const int max_estimate_sum_value = search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value =
+        (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
+    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
+                                          pixelInfo<WT>::sampleMax());
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -127,30 +129,24 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
     almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
-    int max_dist = 255 * 255 * sizeof(T);
+    int max_dist = D::template maxDist<T>();
     int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight_.resize(almost_max_dist);
 
-    const double WEIGHT_THRESHOLD = 0.001;
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
-
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
-
-        almost_dist2weight_[almost_dist] = weight;
+        almost_dist2weight_[almost_dist] =
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
     }
-    CV_Assert(almost_dist2weight_[0] == fixed_point_mult_);
 
     // additional optimization init end
     if (dst_.empty())
         dst_ = Mat::zeros(src_.size(), src_.type());
 }
 
-template <class T>
-void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
@@ -215,7 +211,7 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                             dist_sums_row[x] -= col_dist_sums_row[x];
 
                             int bx = start_bx + x;
-                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
+                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
 
                             dist_sums_row[x] += col_dist_sums_row[x];
                             up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -227,9 +223,11 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
             }
 
             // calc weights
-            int estimation[3], weights_sum = 0;
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
+            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
+                weights_sum[channel_num] = 0;
 
             for (int y = 0; y < search_window_size_; y++)
             {
@@ -238,24 +236,21 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                 for (int x = 0; x < search_window_size_; x++)
                 {
                     int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
-                    int weight = almost_dist2weight_[almostAvgDist];
-                    weights_sum += weight;
-
+                    WT weight = almost_dist2weight_[almostAvgDist];
                     T p = cur_row_ptr[border_size_ + search_window_x + x];
-                    incWithWeight(estimation, weight, p);
+                    incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum/2) / weights_sum;
-
-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
+                                                                                      weights_sum);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
         }
     }
 }
 
-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
     int i,
     Array2d<int>& dist_sums,
     Array3d<int>& col_dist_sums,
@@ -276,7 +271,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                 {
-                    int dist = calcDist<T>(extended_src_,
+                    int dist = D::template calcDist<T>(extended_src_,
                         border_size_ + i + ty, border_size_ + j + tx,
                         border_size_ + start_y + ty, border_size_ + start_x + tx);
 
@@ -288,8 +283,8 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
         }
 }
 
-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num,
     Array2d<int>& dist_sums,
     Array3d<int>& col_dist_sums,
@@ -312,7 +307,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
             int by = start_by + y;
             int bx = start_bx + x;
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
-                col_dist_sums[new_last_col_num][y][x] += calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
+                col_dist_sums[new_last_col_num][y][x] += D::template calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
 
             dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x];
             up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x];
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index ab7db5d2d..8f31e8b02 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -44,118 +44,438 @@
 
 using namespace cv;
 
-template <typename T> static inline int calcDist(const T a, const T b);
-
-template <> inline int calcDist(const uchar a, const uchar b)
+template <typename T> struct pixelInfo_  // traits of a scalar (non-Vec) pixel type T
 {
-    return (a-b) * (a-b);
+    static const int channels = 1;  // a scalar pixel has exactly one channel
+    typedef T sampleType;           // the pixel type is itself the sample type
+};
+
+template <typename ET, int n> struct pixelInfo_<Vec<ET, n> >  // traits of an n-component Vec pixel
+{
+    static const int channels = n;  // one channel per Vec component
+    typedef ET sampleType;          // element type of the Vec
+};
+
+template <typename T> struct pixelInfo: public pixelInfo_<T>
+{   // channel/sample traits of T plus limits and sizes of a single sample
+    typedef typename pixelInfo_<T>::sampleType sampleType;
+    // std::numeric_limits<sampleType>::max() for one sample of T.
+    static inline sampleType sampleMax()
+    {
+        return (std::numeric_limits<sampleType>::max)();
+    }
+    // std::numeric_limits<sampleType>::min() for one sample of T.
+    static inline sampleType sampleMin()
+    {
+        return (std::numeric_limits<sampleType>::min)();
+    }
+    // Size of one sample in bytes.
+    static inline size_t sampleBytes()
+    {
+        return sizeof(sampleType);
+    }
+    // Size of one sample in bits, counted as 8 bits per byte.
+    static inline size_t sampleBits()
+    {
+        return sampleBytes() * 8;
+    }
+};
+
+class DistAbs  // L1 distance policy: sum of absolute per-channel sample differences
+{
+    template <typename T> struct calcDist_
+    {
+        static inline int f(const T a, const T b)
+        {
+            return std::abs((int)(a-b));  // single-channel pixel: |a - b|
+        }
+    };
+
+    template <typename ET> struct calcDist_<Vec<ET, 2> >
+    {
+        static inline int f(const Vec<ET, 2> a, const Vec<ET, 2> b)
+        {
+            return std::abs((int)(a[0]-b[0])) + std::abs((int)(a[1]-b[1]));
+        }
+    };
+
+    template <typename ET> struct calcDist_<Vec<ET, 3> >
+    {
+        static inline int f(const Vec<ET, 3> a, const Vec<ET, 3> b)
+        {
+            return
+                std::abs((int)(a[0]-b[0])) +
+                std::abs((int)(a[1]-b[1])) +
+                std::abs((int)(a[2]-b[2]));
+        }
+    };
+
+    template <typename ET> struct calcDist_<Vec<ET, 4> >
+    {
+        static inline int f(const Vec<ET, 4> a, const Vec<ET, 4> b)
+        {
+            return
+                std::abs((int)(a[0]-b[0])) +
+                std::abs((int)(a[1]-b[1])) +
+                std::abs((int)(a[2]-b[2])) +
+                std::abs((int)(a[3]-b[3]));
+        }
+    };
+
+    template <typename T, typename WT> struct calcWeight_
+    {
+        static inline WT f(double dist, const float *h, WT fixed_point_mult)
+        {
+            double w = std::exp(-dist*dist / (h[0]*h[0] * pixelInfo<T>::channels));
+            if (cvIsNaN(w)) w = 1.0; // Handle h = 0.0 (0/0 when dist is 0 as well)
+            // cvIsNaN/cvRound are used instead of C++11 std::isnan / C99 round; cvRound returns int.
+            static const double WEIGHT_THRESHOLD = 0.001;
+            WT weight = (WT)cvRound(fixed_point_mult * w);
+            if (weight < WEIGHT_THRESHOLD * fixed_point_mult) weight = 0;  // drop negligible weights
+
+            return weight;
+        }
+    };
+
+    template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
+    {
+        static inline Vec<ET, n> f(double dist, const float *h, ET fixed_point_mult)
+        {
+            Vec<ET, n> res;
+            for (int i=0; i<n; i++)  // component i is computed from its own strength h[i]
+                res[i] = calcWeight<T, ET>(dist, &h[i], fixed_point_mult);
+            return res;
+        }
+    };
+
+public:
+    template <typename T> static inline int calcDist(const T a, const T b)
+    {
+        return calcDist_<T>::f(a, b);  // dispatch on scalar / Vec pixel type
+    }
+
+    template <typename T>
+    static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+    {
+        const T a = m.at<T>(i1, j1);
+        const T b = m.at<T>(i2, j2);
+        return calcDist<T>(a,b);  // distance between pixels (i1,j1) and (i2,j2) of m
+    }
+
+    template <typename T>
+    static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+    {
+        return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);  // "down" pair minus "up" pair
+    }
+
+    template <typename T, typename WT>
+    static inline WT calcWeight(double dist, const float *h,
+                                typename pixelInfo<WT>::sampleType fixed_point_mult)
+    {
+        return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);  // scalar or per-component weight
+    }
+
+    template <typename T>
+    static inline int maxDist()
+    {
+        return (int)pixelInfo<T>::sampleMax() * pixelInfo<T>::channels;  // sampleMax per channel
+    }
+};
+
+class DistSquared  // L2 distance policy: sum of squared per-channel sample differences
+{
+    template <typename T> struct calcDist_
+    {
+        static inline int f(const T a, const T b)
+        {
+            return (int)(a-b) * (int)(a-b);  // single-channel pixel: (a - b)^2
+        }
+    };
+
+    template <typename ET> struct calcDist_<Vec<ET, 2> >
+    {
+        static inline int f(const Vec<ET, 2> a, const Vec<ET, 2> b)
+        {
+            return (int)(a[0]-b[0])*(int)(a[0]-b[0]) + (int)(a[1]-b[1])*(int)(a[1]-b[1]);
+        }
+    };
+
+    template <typename ET> struct calcDist_<Vec<ET, 3> >
+    {
+        static inline int f(const Vec<ET, 3> a, const Vec<ET, 3> b)
+        {
+            return
+                (int)(a[0]-b[0])*(int)(a[0]-b[0]) +
+                (int)(a[1]-b[1])*(int)(a[1]-b[1]) +
+                (int)(a[2]-b[2])*(int)(a[2]-b[2]);
+        }
+    };
+
+    template <typename ET> struct calcDist_<Vec<ET, 4> >
+    {
+        static inline int f(const Vec<ET, 4> a, const Vec<ET, 4> b)
+        {
+            return
+                (int)(a[0]-b[0])*(int)(a[0]-b[0]) +
+                (int)(a[1]-b[1])*(int)(a[1]-b[1]) +
+                (int)(a[2]-b[2])*(int)(a[2]-b[2]) +
+                (int)(a[3]-b[3])*(int)(a[3]-b[3]);
+        }
+    };
+
+    template <typename T> struct calcUpDownDist_
+    {
+        static inline int f(T a_up, T a_down, T b_up, T b_down)
+        {
+            int A = a_down - b_down;
+            int B = a_up - b_up;
+            return (A-B)*(A+B);  // equals A*A - B*B using a single multiplication
+        }
+    };
+
+    template <typename ET, int n> struct calcUpDownDist_<Vec<ET, n> >
+    {
+    private:
+        typedef Vec<ET, n> T;
+    public:
+        static inline int f(T a_up, T a_down, T b_up, T b_down)
+        {
+            return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);  // "down" pair minus "up" pair
+        }
+    };
+
+    template <typename T, typename WT> struct calcWeight_
+    {
+        static inline WT f(double dist, const float *h, WT fixed_point_mult)
+        {
+            double w = std::exp(-dist / (h[0]*h[0] * pixelInfo<T>::channels));
+            if (cvIsNaN(w)) w = 1.0; // Handle h = 0.0 (0/0 when dist is 0 as well)
+            // cvIsNaN/cvRound are used instead of C++11 std::isnan / C99 round; cvRound returns int.
+            static const double WEIGHT_THRESHOLD = 0.001;
+            WT weight = (WT)cvRound(fixed_point_mult * w);
+            if (weight < WEIGHT_THRESHOLD * fixed_point_mult) weight = 0;  // drop negligible weights
+
+            return weight;
+        }
+    };
+
+    template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
+    {
+        static inline Vec<ET, n> f(double dist, const float *h, ET fixed_point_mult)
+        {
+            Vec<ET, n> res;
+            for (int i=0; i<n; i++)  // component i is computed from its own strength h[i]
+                res[i] = calcWeight<T, ET>(dist, &h[i], fixed_point_mult);
+            return res;
+        }
+    };
+
+public:
+    template <typename T> static inline int calcDist(const T a, const T b)
+    {
+        return calcDist_<T>::f(a, b);  // dispatch on scalar / Vec pixel type
+    }
+
+    template <typename T>
+    static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+    {
+        const T a = m.at<T>(i1, j1);
+        const T b = m.at<T>(i2, j2);
+        return calcDist<T>(a,b);  // distance between pixels (i1,j1) and (i2,j2) of m
+    }
+
+    template <typename T>
+    static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+    {
+        return calcUpDownDist_<T>::f(a_up, a_down, b_up, b_down);  // scalar fast path or Vec fallback
+    }
+
+    template <typename T, typename WT>
+    static inline WT calcWeight(double dist, const float *h,
+                                typename pixelInfo<WT>::sampleType fixed_point_mult)
+    {
+        return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);  // scalar or per-component weight
+    }
+
+    template <typename T>
+    static inline int maxDist()
+    {
+        return (int)pixelInfo<T>::sampleMax() * (int)pixelInfo<T>::sampleMax() *
+            pixelInfo<T>::channels;  // NOTE(review): exceeds int for 16-bit samples - confirm 8-bit only
+    }
+};
+
+template <typename T, typename IT, typename WT> struct incWithWeight_
+{
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, T p)
+    {
+        weights_sum[0] += static_cast<IT>(weight);     // running sum of weights
+        estimation[0] += static_cast<IT>(weight) * p;  // running weighted sum of the sample
+    }
+};
+
+template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 2>, IT, WT>
+{
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 2> p)
+    {
+        const IT w = static_cast<IT>(weight);  // one scalar weight shared by both channels
+        for (int c = 0; c < 2; c++) estimation[c] += w * p[c];
+        weights_sum[0] += w;
+    }
+};
+
+template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 3>, IT, WT>
+{
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 3> p)
+    {
+        const IT w = static_cast<IT>(weight);  // one scalar weight shared by all three channels
+        for (int c = 0; c < 3; c++)
+            estimation[c] += w * p[c];
+        weights_sum[0] += w;
+    }
+};
+
+template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 4>, IT, WT>
+{
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 4> p)
+    {
+        // One scalar weight is shared by all four channels.
+        const IT w = static_cast<IT>(weight);
+        for (int c = 0; c < 4; c++)
+            estimation[c] += w * p[c];
+        weights_sum[0] += w;
+    }
+};
+
+template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 2>, IT, Vec<EW, 2> >
+{
+    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 2> weight, Vec<ET, 2> p)
+    {
+        for (int c = 0; c < 2; c++)  // channel c is weighted by weight[c]
+            estimation[c] += static_cast<IT>(weight[c]) * p[c];
+        for (int c = 0; c < 2; c++)  // and normalised later by weights_sum[c]
+            weights_sum[c] += static_cast<IT>(weight[c]);
+    }
+};
+
+template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 3>, IT, Vec<EW, 3> >
+{
+    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 3> weight, Vec<ET, 3> p)
+    {
+        // Channel c is weighted by weight[c] and has its own weight total.
+        for (int c = 0; c < 3; c++)
+        {
+            estimation[c] += static_cast<IT>(weight[c]) * p[c];
+            weights_sum[c] += static_cast<IT>(weight[c]);
+        }
+    }
+};
+
+template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 4>, IT, Vec<EW, 4> >
+{
+    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 4> weight, Vec<ET, 4> p)
+    {
+        // Channel c is weighted by weight[c]; the weighted sample goes to
+        // estimation[c] and the weight itself to weights_sum[c].
+        for (int c = 0; c < 4; c++)
+        {
+            const IT w = static_cast<IT>(weight[c]);
+            estimation[c] += w * p[c];
+            weights_sum[c] += w;
+        }
+    }
+};
+
+template <typename T, typename IT, typename WT>
+static inline void incWithWeight(IT* estimation, IT* weights_sum, WT weight, T p)
+{
+    incWithWeight_<T, IT, WT>::f(estimation, weights_sum, weight, p);  // dispatch on pixel/weight type
 }
 
-template <> inline int calcDist(const Vec2b a, const Vec2b b)
+template <typename IT, typename UIT, int nc, int nw> struct divByWeightsSum_  // nc estimates, nw weight sums
 {
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]);
+    static inline void f(IT* estimation, IT* weights_sum);  // declared only; the primary template gives no body
+};
+
+template <typename IT, typename UIT> struct divByWeightsSum_<IT, UIT, 1, 1>
+{
+    static inline void f(IT* estimation, IT* weights_sum)
+    {
+        *estimation = (static_cast<UIT>(*estimation) + *weights_sum / 2) / *weights_sum;  // half the divisor added first
+    }
+};
+
+template <typename IT, typename UIT, int n> struct divByWeightsSum_<IT, UIT, n, 1>
+{
+    static inline void f(IT* estimation, IT* weights_sum)
+    {
+        for (int c = 0; c < n; ++c)  // all n channels share the single weight total weights_sum[0]
+            estimation[c] = (static_cast<UIT>(estimation[c]) + weights_sum[0] / 2) / weights_sum[0];
+    }
+};
+
+template <typename IT, typename UIT, int n> struct divByWeightsSum_<IT, UIT, n, n>
+{
+    static inline void f(IT* estimation, IT* weights_sum)
+    {
+        for (int c = 0; c < n; ++c)  // channel c is divided by its own weight total weights_sum[c]
+            estimation[c] = (static_cast<UIT>(estimation[c]) + weights_sum[c] / 2) / weights_sum[c];
+    }
+};
+
+template <typename IT, typename UIT, int nc, int nw>
+static inline void divByWeightsSum(IT* estimation, IT* weights_sum)
+{
+    divByWeightsSum_<IT, UIT, nc, nw>::f(estimation, weights_sum);  // dispatch on (nc, nw)
 }
 
-template <> inline int calcDist(const Vec3b a, const Vec3b b)
+template <typename T, typename IT> struct saturateCastFromArray_
 {
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]) + (a[2]-b[2])*(a[2]-b[2]);
-}
+    static inline T f(IT* estimation)
+    {
+        return saturate_cast<T>(*estimation);  // scalar pixel: only the first accumulator is read
+    }
+};
 
-template <typename T> static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 2>, IT>
 {
-    const T a = m.at<T>(i1, j1);
-    const T b = m.at<T>(i2, j2);
-    return calcDist<T>(a,b);
-}
+    static inline Vec<ET, 2> f(IT* estimation)
+    {
+        // Each accumulated channel is saturated to ET independently.
+        return Vec<ET, 2>(
+            saturate_cast<ET>(estimation[0]),
+            saturate_cast<ET>(estimation[1]));
+    }
+};
 
-template <typename T> static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 3>, IT>
 {
-    return calcDist(a_down, b_down) - calcDist(a_up, b_up);
-}
+    static inline Vec<ET, 3> f(IT* estimation)
+    {
+        // Each accumulated channel is saturated to ET independently.
+        return Vec<ET, 3>(
+            saturate_cast<ET>(estimation[0]),
+            saturate_cast<ET>(estimation[1]),
+            saturate_cast<ET>(estimation[2]));
+    }
+};
 
-template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar  b_up, uchar b_down)
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 4>, IT>
 {
-    int A = a_down - b_down;
-    int B = a_up - b_up;
-    return (A-B)*(A+B);
-}
+    static inline Vec<ET, 4> f(IT* estimation)
+    {
+        // Each accumulated channel is saturated to ET independently.
+        return Vec<ET, 4>(
+            saturate_cast<ET>(estimation[0]),
+            saturate_cast<ET>(estimation[1]),
+            saturate_cast<ET>(estimation[2]),
+            saturate_cast<ET>(estimation[3]));
+    }
+};
 
-template <typename T> static inline void incWithWeight(int* estimation, int weight, T p);
-
-template <> inline void incWithWeight(int* estimation, int weight, uchar p)
+template <typename T, typename IT> static inline T saturateCastFromArray(IT* estimation)
 {
-    estimation[0] += weight * p;
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, Vec2b p)
-{
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, Vec3b p)
-{
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-    estimation[2] += weight * p[2];
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, int p)
-{
-    estimation[0] += weight * p;
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, Vec2i p)
-{
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, Vec3i p)
-{
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-    estimation[2] += weight * p[2];
-}
-
-template <typename T> static inline T saturateCastFromArray(int* estimation);
-
-template <> inline uchar saturateCastFromArray(int* estimation)
-{
-    return saturate_cast<uchar>(estimation[0]);
-}
-
-template <> inline Vec2b saturateCastFromArray(int* estimation)
-{
-    Vec2b res;
-    res[0] = saturate_cast<uchar>(estimation[0]);
-    res[1] = saturate_cast<uchar>(estimation[1]);
-    return res;
-}
-
-template <> inline Vec3b saturateCastFromArray(int* estimation)
-{
-    Vec3b res;
-    res[0] = saturate_cast<uchar>(estimation[0]);
-    res[1] = saturate_cast<uchar>(estimation[1]);
-    res[2] = saturate_cast<uchar>(estimation[2]);
-    return res;
-}
-
-template <> inline int saturateCastFromArray(int* estimation)
-{
-    return estimation[0];
-}
-
-template <> inline Vec2i saturateCastFromArray(int* estimation)
-{
-    estimation[1] = 0;
-    return Vec2i(estimation);
-}
-
-template <> inline Vec3i saturateCastFromArray(int* estimation)
-{
-    return Vec3i(estimation);
+    return saturateCastFromArray_<T, IT>::f(estimation);
 }
 
 #endif
diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index 1cdd8fa49..1c511f37b 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -28,12 +28,16 @@ static int divUp(int a, int b)
     return (a + b - 1) / b;
 }
 
-template <typename FT>
-static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
+template <typename FT, typename ST, typename WT>
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
+                                      int searchWindowSize, int templateWindowSize,
+                                      const FT *h, int hn, int cn, int normType,
                                       int & almostTemplateWindowSizeSqBinShift)
 {
-    const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255;
-    int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue;
+    const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
+        std::numeric_limits<ST>::max();
+    int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
+                                           std::numeric_limits<int>::max());
     int depth = DataType<FT>::depth;
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
@@ -48,33 +52,44 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
     FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
 
     const FT WEIGHT_THRESHOLD = 1e-3f;
-    int maxDist = 255 * 255 * cn;
+    int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
+        std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
     int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
-    FT den = 1.0f / (h * h * cn);
+    FT den[4];
+    CV_Assert(hn > 0 && hn <= 4);
+    for (int i=0; i<hn; i++)
+        den[i] = 1.0f / (h[i] * h[i] * cn);
 
-    almostDist2Weight.create(1, almostMaxDist, CV_32SC1);
+    almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
 
+    char buf[40];
     ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
-                  format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                  format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
+                         " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
+                         ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         normType == NORM_L1 ? " -D ABS" : ""));
     if (k.empty())
         return false;
 
     k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
-           almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD);
+           almostDist2ActualDistMultiplier, fixedPointMult,
+           ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
 
     size_t globalsize[1] = { almostMaxDist };
     return k.run(1, globalsize, NULL, false);
 }
 
-static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
-                                     int templateWindowSize, int searchWindowSize)
+static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
+                                     int templateWindowSize, int searchWindowSize, int normType)
 {
-    int type = _src.type(), cn = CV_MAT_CN(type);
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
     Size size = _src.size();
 
-    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
+    if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
+                             (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
         return false;
 
     int templateWindowHalfWize = templateWindowSize / 2;
@@ -84,33 +99,68 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
     int almostTemplateWindowSizeSqBinShift = -1;
 
-    char cvt[2][40];
+    char buf[4][40];
     String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
-                         " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
+                         " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
+                         " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
+                         " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
-                         " -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s",
-                         templateWindowSize, searchWindowSize, ocl::typeToStr(type),
-                         ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize,
-                         templateWindowHalfWize, searchWindowHalfSize,
-                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
-                         ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
+                         " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
+                         templateWindowSize, searchWindowSize,
+                         ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
+                         ocl::typeToStr(CV_32SC(hn)),
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
+                         format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
+                         format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
+                         format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
+                         format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
+                         BLOCK_COLS, BLOCK_ROWS,
+                         ctaSize, templateWindowHalfWize, searchWindowHalfSize,
+                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
+                         (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
+                         ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
+                         normType == NORM_L1 ? " -D ABS" : "");
 
     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
     if (k.empty())
         return false;
 
     UMat almostDist2Weight;
-    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn,
-                                   almostTemplateWindowSizeSqBinShift))
+    if ((depth == CV_8U &&
+         !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
+                                                       searchWindowSize, templateWindowSize,
+                                                       h, hn, cn, normType,
+                                                       almostTemplateWindowSizeSqBinShift)) ||
+        (depth == CV_16U &&
+         !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
+                                                          searchWindowSize, templateWindowSize,
+                                                          h, hn, cn, normType,
+                                                          almostTemplateWindowSizeSqBinShift)))
         return false;
     CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
 
     UMat srcex;
     int borderSize = searchWindowHalfSize + templateWindowHalfWize;
-    copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+    if (cn == 3) {
+        srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
+        UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
+        int from_to[] = { 0,0, 1,1, 2,2 };
+        mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
+        copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
+                       BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
+    }
+    else
+        copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
 
     _dst.create(size, type);
-    UMat dst = _dst.getUMat();
+    UMat dst;
+    if (cn == 3)
+        dst.create(size, CV_MAKE_TYPE(depth, 4));
+    else
+        dst = _dst.getUMat();
 
     int searchWindowSizeSq = searchWindowSize * searchWindowSize;
     Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
@@ -123,7 +173,14 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
            ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
 
     size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
-    return k.run(2, globalsize, localsize, false);
+    if (!k.run(2, globalsize, localsize, false)) return false;
+
+    if (cn == 3) {
+        int from_to[] = { 0,0, 1,1, 2,2 };
+        mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
+    }
+
+    return true;
 }
 
 static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 191a67127..3f13f400d 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -50,14 +50,14 @@
 
 using namespace cv;
 
-template <typename T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansMultiDenoisingInvoker :
         ParallelLoopBody
 {
 public:
     FastNlMeansMultiDenoisingInvoker(const std::vector<Mat>& srcImgs, int imgToDenoiseIndex,
                                      int temporalWindowSize, Mat& dst, int template_window_size,
-                                     int search_window_size, const float h);
+                                     int search_window_size, const float *h);
 
     void operator() (const Range& range) const;
 
@@ -81,9 +81,9 @@ private:
     int search_window_half_size_;
     int temporal_window_half_size_;
 
-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift;
-    std::vector<int> almost_dist2weight;
+    std::vector<WT> almost_dist2weight;
 
     void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
                                           Array4d<int>& col_dist_sums,
@@ -94,19 +94,19 @@ private:
                                           Array4d<int>& up_col_dist_sums) const;
 };
 
-template <class T>
-FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansMultiDenoisingInvoker(
     const std::vector<Mat>& srcImgs,
     int imgToDenoiseIndex,
     int temporalWindowSize,
     cv::Mat& dst,
     int template_window_size,
     int search_window_size,
-    const float h) :
+    const float *h) :
         dst_(dst), extended_srcs_(srcImgs.size())
 {
     CV_Assert(srcImgs.size() > 0);
-    CV_Assert(srcImgs[0].channels() == sizeof(T));
+    CV_Assert(srcImgs[0].channels() == pixelInfo<T>::channels);
 
     rows_ = srcImgs[0].rows;
     cols_ = srcImgs[0].cols;
@@ -125,8 +125,10 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
             border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT);
 
     main_extended_src_ = extended_srcs_[temporal_window_half_size_];
-    const int max_estimate_sum_value = temporal_window_size_ * search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value =
+        (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
+    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
+                                          pixelInfo<WT>::sampleMax());
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -138,30 +140,24 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
 
-    int max_dist = 255 * 255 * sizeof(T);
-    int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
+    int max_dist = D::template maxDist<T>();
+    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight.resize(almost_max_dist);
 
-    const double WEIGHT_THRESHOLD = 0.001;
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
-
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
-
-        almost_dist2weight[almost_dist] = weight;
+        almost_dist2weight[almost_dist] =
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
     }
-    CV_Assert(almost_dist2weight[0] == fixed_point_mult_);
 
     // additional optimization init end
     if (dst_.empty())
         dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type());
 }
 
-template <class T>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
@@ -234,7 +230,7 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                                 dist_sums_row[x] -= col_dist_sums_row[x];
 
                                 col_dist_sums_row[x] = up_col_dist_sums_row[x] +
-                                    calcUpDownDist(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
+                                    D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
 
                                 dist_sums_row[x] += col_dist_sums_row[x];
                                 up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -247,11 +243,11 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
             }
 
             // calc weights
-            int weights_sum = 0;
-
-            int estimation[3];
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
+            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
+                weights_sum[channel_num] = 0;
 
             for (int d = 0; d < temporal_window_size_; d++)
             {
@@ -266,26 +262,22 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                     {
                         int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
 
-                        int weight = almost_dist2weight[almostAvgDist];
-                        weights_sum += weight;
-
+                        WT weight =  almost_dist2weight[almostAvgDist];
                         T p = cur_row_ptr[border_size_ + search_window_x + x];
-                        incWithWeight(estimation, weight, p);
+                        incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                     }
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum / 2) / weights_sum;
-
-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
-
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
+                                                                                      weights_sum);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
         }
     }
 }
 
-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
         int i, Array3d<int>& dist_sums, Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
     int j = 0;
@@ -310,7 +302,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
                 {
                     for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                     {
-                        int dist = calcDist<T>(
+                        int dist = D::template calcDist<T>(
                                     main_extended_src_.at<T>(border_size_ + i + ty, border_size_ + j + tx),
                                     cur_extended_src.at<T>(border_size_ + start_y + ty, border_size_ + start_x + tx));
 
@@ -325,8 +317,8 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
     }
 }
 
-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num, Array3d<int>& dist_sums,
     Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
@@ -353,7 +345,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRo
                 int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
                 for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 {
-                    *col_dist_sums_ptr += calcDist<T>(
+                    *col_dist_sums_ptr += D::template calcDist<T>(
                                 main_extended_src_.at<T>(ay + ty, ax),
                                 cur_extended_src.at<T>(by + ty, bx));
                 }
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl
index af3fb1f9b..879665f48 100644
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -20,21 +20,23 @@
 
 #ifdef OP_CALC_WEIGHTS
 
-__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist,
+__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist, // each work-item fills the LUT entry indexed by get_global_id(0)
                                     FT almostDist2ActualDistMultiplier, int fixedPointMult,
-                                    FT den, FT WEIGHT_THRESHOLD)
+                                    w_t den, FT WEIGHT_THRESHOLD)
 {
     int almostDist = get_global_id(0);
 
     if (almostDist < almostMaxDist)
     {
         FT dist = almostDist * almostDist2ActualDistMultiplier;
-        int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
-
-        if (weight < WEIGHT_THRESHOLD * fixedPointMult)
-            weight = 0;
-
-        almostDist2Weight[almostDist] = weight;
+#ifdef ABS
+        w_t w = exp((w_t)(-dist*dist) * den); // ABS defined: the exponent uses -dist*dist (dist squared)
+#else
+        w_t w = exp((w_t)(-dist) * den); // ABS not defined: the exponent uses -dist
+#endif
+        wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w)); // a NaN w is replaced by 1.0 before scaling by fixedPointMult and converting to wlut_t
+        almostDist2Weight[almostDist] =
+            weight < (wlut_t)(WEIGHT_THRESHOLD * fixedPointMult) ? (wlut_t)0 : weight; // entries below WEIGHT_THRESHOLD * fixedPointMult are stored as 0
     }
 }
 
@@ -44,21 +46,35 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
 
 #define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)
 
-inline int calcDist(uchar_t a, uchar_t b)
+inline int calcDist(pixel_t a, pixel_t b) // returns the per-channel distance terms of a and b summed into a single int
 {
+#ifdef ABS
+    int_t retval = convert_int_t(abs_diff(a, b)); // ABS defined: per-channel term is abs_diff(a, b)
+#else
     int_t diff = convert_int_t(a) - convert_int_t(b);
     int_t retval = diff * diff;
+#endif
 
 #if cn == 1
     return retval;
 #elif cn == 2
     return retval.x + retval.y;
+#elif cn == 3
+    return retval.x + retval.y + retval.z; // three channels: add the x, y and z terms
+#elif cn == 4
+    return retval.x + retval.y + retval.z + retval.w; // four channels: add all four terms
 #else
-#error "cn should be either 1 or 2"
+#error "cn should be either 1, 2, 3 or 4"
 #endif
 }
 
-inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t)
+#ifdef ABS
+inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
+{ // variant compiled when ABS is defined; built entirely on calcDist()
+    return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); // distance of the "down" pair minus distance of the "up" pair
+}
+#else
+inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
 {
     int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);
     int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);
@@ -68,10 +84,15 @@ inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_v
     return retval;
 #elif cn == 2
     return retval.x + retval.y;
+#elif cn == 3
+    return retval.x + retval.y + retval.z;
+#elif cn == 4
+    return retval.x + retval.y + retval.z + retval.w;
 #else
-#error "cn should be either 1 or 2"
+#error "cn should be either 1, 2, 3 or 4"
 #endif
 }
+#endif
 
 #define COND if (x == 0 && y == 0)
 
@@ -87,9 +108,9 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
     {
         int dist = 0, value;
 
-        __global const uchar_t * src_template = (__global const uchar_t *)(src +
-            mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
-        __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
+        __global const pixel_t * src_template = (__global const pixel_t *)(src +
+            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
         __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
 
         #pragma unroll
@@ -107,8 +128,8 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
                 dist += value;
             }
 
-            src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
-            src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
+            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
+            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
         }
 
         #pragma unroll
@@ -130,9 +151,9 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
 
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
-        __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
-        __global const uchar_t * src_template = (__global const uchar_t *)(src +
-            mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
+        __global const pixel_t * src_template = (__global const pixel_t *)(src +
+            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
         __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
 
         int col_dist = 0;
@@ -142,8 +163,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
         {
             col_dist += calcDist(src_current[0], src_template[0]);
 
-            src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
-            src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
+            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
+            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
         }
 
         dists[i] += col_dist - col_dists_current[first];
@@ -160,8 +181,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
     int sy_up = y - TEMPLATE_SIZE2 - 1;
     int sy_down = y + TEMPLATE_SIZE2;
 
-    uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
-    uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
+    pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));
+    pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));
 
     sx -= SEARCH_SIZE2;
     sy_up -= SEARCH_SIZE2;
@@ -171,8 +192,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
     {
         int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
 
-        uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
-        uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
+        pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));
+        pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));
 
         __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
         __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
@@ -186,24 +207,25 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
 }
 
 inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
-                           __local int * dists, __global const int * almostDist2Weight,
+                           __local int * dists, __global const wlut_t * almostDist2Weight,
                            __global uchar * dst, int dst_step, int dst_offset,
-                           int y, int x, int id, __local int * weights_local,
-                           __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
+                           int y, int x, int id, __local weight_t * weights_local,
+                           __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
 {
-    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0;
-    int_t weighted_sum = (int_t)(0);
+    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
+    weight_t weights = (weight_t)0;
+    sum_t weighted_sum = (sum_t)0;
 
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
-        int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
-        int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index));
+        int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));
+        sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
 
         int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
-        int weight = almostDist2Weight[almostAvgDist];
+        weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);
 
         weights += weight;
-        weighted_sum += (int_t)(weight) * src_value;
+        weighted_sum += (sum_t)weight * src_value;
     }
 
     weights_local[id] = weights;
@@ -223,26 +245,27 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
 
     if (id == 0)
     {
-        int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
-        int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
+        int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));
+        sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
             weighted_sum_local[2] + weighted_sum_local[3];
-        int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
+        weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
 
-        *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / (int_t)(weights_local_0));
+        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);
     }
 }
 
 __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                                   __global const int * almostDist2Weight, __global uchar * buffer,
+                                   __global const wlut_t * almostDist2Weight, __global uchar * buffer,
                                    int almostTemplateWindowSizeSqBinShift)
 {
     int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
     int block_y = get_group_id(1);
     int id = get_local_id(0), first;
 
-    __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE];
-    __local int_t weighted_sum[CTA_SIZE];
+    __local int dists[SEARCH_SIZE_SQ];
+    __local weight_t weights[CTA_SIZE];
+    __local sum_t weighted_sum[CTA_SIZE];
 
     int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
     int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index cb2d74f85..f749564c6 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -13,11 +13,11 @@
 namespace cvtest {
 namespace ocl {
 
-PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
+PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, int, bool, bool)
 {
-    int cn, templateWindowSize, searchWindowSize;
-    float h;
-    bool use_roi;
+    int cn, normType, templateWindowSize, searchWindowSize;
+    std::vector<float> h;
+    bool use_roi, use_image;
 
     TEST_DECLARE_INPUT_PARAMETER(src);
     TEST_DECLARE_OUTPUT_PARAMETER(dst);
@@ -25,29 +25,46 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
     virtual void SetUp()
     {
         cn = GET_PARAM(0);
-        use_roi = GET_PARAM(1);
+        normType = GET_PARAM(1); // norm selector that the tests forward to cv::fastNlMeansDenoising
+        use_roi = GET_PARAM(2); // when true, generateTestData() requests random borders around src and dst
+        use_image = GET_PARAM(3); // when true, generateTestData() reads the noisy sample image and sizes the ROI from it
 
         templateWindowSize = 7;
         searchWindowSize = 21;
-        h = 3.0f;
+
+        h.resize(cn); // one h value per channel
+        for (int i=0; i<cn; i++)
+            h[i] = 3.0f + 0.5f*i; // 3.0 for channel 0, then +0.5 for each following channel
     }
 
     virtual void generateTestData()
     {
+        const int type = CV_8UC(cn);
         Mat image;
-        if (cn == 1)
-        {
-            image = readImage("denoising/lena_noised_gaussian_sigma=10.png", IMREAD_GRAYSCALE);
+
+        if (use_image) {
+            image = readImage("denoising/lena_noised_gaussian_sigma=10.png",
+                                  cn == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
             ASSERT_FALSE(image.empty());
         }
 
-        const int type = CV_8UC(cn);
-
-        Size roiSize = cn == 1 ? image.size() : randomSize(1, MAX_VALUE);
+        Size roiSize = use_image ? image.size() : randomSize(1, MAX_VALUE);
         Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
-        if (cn == 1)
-            image.copyTo(src_roi);
+        if (use_image) {
+            ASSERT_TRUE(cn > 0 && cn <= 4);
+            if (cn == 2) {
+                int from_to[] = { 0,0, 1,1 };
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 2);
+            }
+            else if (cn == 4) {
+                int from_to[] = { 0,0, 1,1, 2,2, 1,3};
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 4);
+            }
+            else image.copyTo(src_roi);
+        }
 
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255);
@@ -65,8 +82,23 @@ OCL_TEST_P(FastNlMeansDenoising, Mat)
     {
         generateTestData();
 
-        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1);
+    }
+}
+
+typedef FastNlMeansDenoisingTestBase FastNlMeansDenoising_hsep;
+
+OCL_TEST_P(FastNlMeansDenoising_hsep, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize, normType));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize, normType));
 
         OCL_EXPECT_MATS_NEAR(dst, 1);
     }
@@ -80,15 +112,21 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
     {
         generateTestData();
 
-        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h[0], h[0], templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h[0], h[0], templateWindowSize, searchWindowSize));
 
         OCL_EXPECT_MATS_NEAR(dst, 1);
     }
 }
 
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool()));
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored, Combine(Values(3, 4), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1), // cn in 1..4, normType in {NORM_L2, NORM_L1}
+                                    Bool(), Values(true))); // use_roi both ways, use_image fixed to true
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising_hsep,
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1), // same grid as FastNlMeansDenoising
+                                    Bool(), Values(true))); // use_roi both ways, use_image fixed to true
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
+                            Combine(Values(3, 4), Values((int)NORM_L2), Bool(), Values(false))); // cn in {3, 4}, NORM_L2 only, use_roi both ways, use_image = false
 
 } } // namespace cvtest::ocl