Merge pull request #3814 from erikrk:denoising-16bit-master

2015-03-24 15:23:58 +00:00
parent 7ea0239776 01d3df0d00
commit 5501cfd809
9 changed files with 977 additions and 318 deletions
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -442,6 +442,10 @@ template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(
 template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
 // The generic overloads above perform a plain _Tp(v) conversion without any clamping;
 // saturating behaviour for specific destination types comes from explicit specializations.
 //! @cond IGNORED
@@ -452,6 +456,8 @@ template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_c
 // Unsigned sources can only exceed the upper bound, so clamping with std::min is sufficient.
 template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
 // Floating-point sources are rounded with cvRound first, then saturated through the int overload.
 template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
 template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
 // Casting v to uint64 turns negative values into huge ones, so a single unsigned comparison
 // tests 0 <= v <= UCHAR_MAX; out-of-range inputs clamp to UCHAR_MAX (v > 0) or to 0.
 template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
 template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
 // Unsigned -> schar: only the upper bound (SCHAR_MAX) can be violated.
 template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
 template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
@@ -460,6 +466,8 @@ template<> inline schar saturate_cast<schar>(short v)        { return saturate_c
 // Unsigned source: only the upper bound (SCHAR_MAX) can be violated.
 template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
 // Floating-point sources are rounded with cvRound first, then saturated through the int overload.
 template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
 template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
 // Subtracting SCHAR_MIN shifts the accepted range so that it starts at 0; one unsigned
 // comparison against UCHAR_MAX then checks both bounds. Out-of-range inputs clamp to
 // SCHAR_MAX (v > 0) or SCHAR_MIN.
 // NOTE(review): the subtraction is done in signed int64, so it overflows for v within
 // -SCHAR_MIN of the int64 maximum - confirm this is acceptable on all supported compilers.
 template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
 template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
 // Signed -> ushort from narrower signed types: only negative values need clamping (to 0).
 template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
 template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
@@ -467,12 +475,16 @@ template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((
 // Unsigned source: only the upper bound (USHRT_MAX) can be violated.
 template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
 // Floating-point sources are rounded with cvRound first, then saturated through the int overload.
 template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
 template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
 // Casting v to uint64 makes negative values huge, so one unsigned comparison tests
 // 0 <= v <= USHRT_MAX; out-of-range inputs clamp to USHRT_MAX (v > 0) or to 0.
 template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
 template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
 // Unsigned -> short: only the upper bound (SHRT_MAX) can be violated.
 template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
 // Subtracting SHRT_MIN shifts the accepted range so that it starts at 0; one unsigned
 // comparison against USHRT_MAX then checks both bounds.
 // NOTE(review): the subtraction happens in the signed source type and overflows for v
 // close to that type's maximum - confirm this is acceptable on all supported compilers.
 template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
 template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
 template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
 template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
 template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
 template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
 // Floating-point -> int is just cvRound(v); no further clamping is applied here.
 template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
 template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -119,7 +119,7 @@ CV_EXPORTS_W void inpaint( InputArray src, InputArray inpaintMask,
 <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
 optimizations. Noise expected to be a gaussian white noise
-@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
+@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
@param dst Output image with the same size and type as src .
@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@@ -138,6 +138,35 @@ parameter.
 CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h = 3,
        int templateWindowSize = 7, int searchWindowSize = 21);
 /** @brief Perform image denoising using Non-local Means Denoising algorithm
 <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
 optimizations. The noise is expected to be Gaussian white noise.
@param src Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
 2-channel, 3-channel or 4-channel image.
@param dst Output image with the same size and type as src .
@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@param searchWindowSize Size in pixels of the window that is used to compute the weighted average
 for a given pixel. Should be odd. Affects performance linearly: the greater the searchWindowSize,
 the greater the denoising time. Recommended value 21 pixels
@param h Array of parameters regulating filter strength, either one
 parameter applied to all channels or one per channel in dst. A big h value
 perfectly removes noise but also removes image details, a smaller h
 value preserves details but also preserves some noise
@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
 This function is expected to be applied to grayscale images. For colored images look at
 fastNlMeansDenoisingColored. An advanced usage of this function is manual denoising of a colored
 image in different colorspaces. Such an approach is used in fastNlMeansDenoisingColored by
 converting the image to CIELAB colorspace and then separately denoising the L and AB components
 with different h parameters.
 */
 CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
                                        const std::vector<float>& h,
                                        int templateWindowSize = 7, int searchWindowSize = 21,
                                        int normType = NORM_L2);
 /** @brief Modification of fastNlMeansDenoising function for colored images
@param src Input 8-bit 3-channel image.
@@ -165,7 +194,35 @@ captured in small period of time. For example video. This version of the functio
 images or for manual manipulation with colorspaces. For more details see
 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
-@param srcImgs Input 8-bit 1-channel, 2-channel or 3-channel images sequence. All images should
+@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
 4-channel images sequence. All images should have the same type and
 size.
@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
 be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
 imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
 srcImgs[imgToDenoiseIndex] image.
@param dst Output image with the same size and type as srcImgs images.
@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
@param h Parameter regulating filter strength. Bigger h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
 */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
        int imgToDenoiseIndex, int temporalWindowSize,
        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
 /** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been
 captured in small period of time. For example video. This version of the function is for grayscale
 images or for manual manipulation with colorspaces. For more details see
 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
@param srcImgs Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
 2-channel, 3-channel or 4-channel images sequence. All images should
 have the same type and size.
@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
@@ -178,13 +235,17 @@ Should be odd. Recommended value 7 pixels
@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
+@param h Array of parameters regulating filter strength, either one
-removes noise but also removes image details, smaller h value preserves details but also preserves
+parameter applied to all channels or one per channel in dst. Big h value
-some noise
+perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
 */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
-        int imgToDenoiseIndex, int temporalWindowSize,
+                                             int imgToDenoiseIndex, int temporalWindowSize,
-        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+                                             const std::vector<float>& h,
                                             int templateWindowSize = 7, int searchWindowSize = 21,
                                             int normType = NORM_L2);
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -45,42 +45,115 @@
 #include "fast_nlmeans_multi_denoising_invoker.hpp"
 #include "fast_nlmeans_denoising_opencl.hpp"
 template<typename ST, typename IT, typename UIT, typename D>
 static void fastNlMeansDenoising_( const Mat& src, Mat& dst, const std::vector<float>& h,
                                   int templateWindowSize, int searchWindowSize)
 {
    int hn = (int)h.size();
    switch (CV_MAT_CN(src.type())) {
        case 1:
            parallel_for_(cv::Range(0, src.rows),
                          FastNlMeansDenoisingInvoker<ST, IT, UIT, D, int>(
                              src, dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        case 2:
            if (hn == 1)
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
            else
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        case 3:
            if (hn == 1)
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
            else
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        case 4:
            if (hn == 1)
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
            else
                parallel_for_(cv::Range(0, src.rows),
                              FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        default:
            CV_Error(Error::StsBadArg,
                     "Unsupported number of channels! Only 1, 2, 3, and 4 are supported");
    }
 }
 // Scalar-strength convenience overload: wraps h into a one-element vector and
 // forwards to the std::vector<float> overload, leaving normType at that overload's default.
 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
                               int templateWindowSize, int searchWindowSize)
 {
    const std::vector<float> strengths(1, h);
    fastNlMeansDenoising(_src, _dst, strengths, templateWindowSize, searchWindowSize);
 }
 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, const std::vector<float>& h,
                               int templateWindowSize, int searchWindowSize, int normType)
 {
    int hn = (int)h.size(), type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert(hn == 1 || hn == cn);
    Size src_size = _src.size();
    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
               src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize))
+               ocl_fastNlMeansDenoising(_src, _dst, &h[0], hn,
                                        templateWindowSize, searchWindowSize, normType))
    Mat src = _src.getMat();
    _dst.create(src_size, src.type());
    Mat dst = _dst.getMat();
    switch (normType) {
        case NORM_L2:
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if(tegra::useTegra() && tegra::fastNlMeansDenoising(src, dst, h, templateWindowSize, searchWindowSize))
+            if(hn == 1 && tegra::useTegra() &&
-        return;
+               tegra::fastNlMeansDenoising(src, dst, h[0], templateWindowSize, searchWindowSize))
                return;
 #endif
-
+            switch (depth) {
-    switch (src.type()) {
+                case CV_8U:
-        case CV_8U:
+                    fastNlMeansDenoising_<uchar, int, unsigned, DistSquared>(src, dst, h,
-            parallel_for_(cv::Range(0, src.rows),
+                                                                             templateWindowSize,
-                FastNlMeansDenoisingInvoker<uchar>(
+                                                                             searchWindowSize);
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    break;
                default:
                    CV_Error(Error::StsBadArg,
                             "Unsupported depth! Only CV_8U is supported for NORM_L2");
            }
            break;
-        case CV_8UC2:
+        case NORM_L1:
-            parallel_for_(cv::Range(0, src.rows),
+            switch (depth) {
-                FastNlMeansDenoisingInvoker<cv::Vec2b>(
+                case CV_8U:
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    fastNlMeansDenoising_<uchar, int, unsigned, DistAbs>(src, dst, h,
-            break;
+                                                                         templateWindowSize,
-        case CV_8UC3:
+                                                                         searchWindowSize);
-            parallel_for_(cv::Range(0, src.rows),
+                    break;
-                FastNlMeansDenoisingInvoker<cv::Vec3b>(
+                case CV_16U:
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    fastNlMeansDenoising_<ushort, int64, uint64, DistAbs>(src, dst, h,
                                                                          templateWindowSize,
                                                                          searchWindowSize);
                    break;
                default:
                    CV_Error(Error::StsBadArg,
                             "Unsupported depth! Only CV_8U and CV_16U are supported for NORM_L1");
            }
            break;
        default:
            CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8UC1, CV_8UC2 and CV_8UC3 are supported");
+                     "Unsupported norm type! Only NORM_L2 and NORM_L1 are supported");
    }
 }
@@ -92,7 +165,7 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
    Size src_size = _src.size();
    if (type != CV_8UC3 && type != CV_8UC4)
    {
-        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3!");
+        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3 or CV_8UC4!");
        return;
    }
@@ -108,8 +181,8 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
    Mat src_lab;
    cvtColor(src, src_lab, COLOR_LBGR2Lab);
-    Mat l(src_size, CV_8U);
+    Mat l(src_size, CV_MAKE_TYPE(depth, 1));
-    Mat ab(src_size, CV_8UC2);
+    Mat ab(src_size, CV_MAKE_TYPE(depth, 2));
    Mat l_ab[] = { l, ab };
    int from_to[] = { 0,0, 1,1, 2,2 };
    mixChannels(&src_lab, 1, l_ab, 2, from_to, 3);
@@ -157,9 +230,76 @@ static void fastNlMeansDenoisingMultiCheckPreconditions(
        }
 }
 template<typename ST, typename IT, typename UIT, typename D>
 static void fastNlMeansDenoisingMulti_( const std::vector<Mat>& srcImgs, Mat& dst,
                                        int imgToDenoiseIndex, int temporalWindowSize,
                                        const std::vector<float>& h,
                                        int templateWindowSize, int searchWindowSize)
 {
    int hn = (int)h.size();
    switch (srcImgs[0].type())
    {
        case CV_8U:
            parallel_for_(cv::Range(0, srcImgs[0].rows),
                          FastNlMeansMultiDenoisingInvoker<uchar, IT, UIT, D, int>(
                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
                              dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        case CV_8UC2:
            if (hn == 1)
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                  dst, templateWindowSize, searchWindowSize, &h[0]));
            else
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                  dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        case CV_8UC3:
            if (hn == 1)
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                  dst, templateWindowSize, searchWindowSize, &h[0]));
            else
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                  dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        case CV_8UC4:
            if (hn == 1)
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                  dst, templateWindowSize, searchWindowSize, &h[0]));
            else
                parallel_for_(cv::Range(0, srcImgs[0].rows),
                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                  dst, templateWindowSize, searchWindowSize, &h[0]));
            break;
        default:
            CV_Error(Error::StsBadArg,
                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
    }
 }
 // Scalar-strength convenience overload: wraps h into a one-element vector and
 // forwards to the std::vector<float> overload, leaving normType at that overload's default.
 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                    int imgToDenoiseIndex, int temporalWindowSize,
                                    float h, int templateWindowSize, int searchWindowSize)
 {
    const std::vector<float> strengths(1, h);
    fastNlMeansDenoisingMulti(_srcImgs, _dst, imgToDenoiseIndex, temporalWindowSize,
                              strengths, templateWindowSize, searchWindowSize);
 }
 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                    int imgToDenoiseIndex, int temporalWindowSize,
                                    const std::vector<float>& h,
                                    int templateWindowSize, int searchWindowSize, int normType)
 {
    std::vector<Mat> srcImgs;
    _srcImgs.getMatVector(srcImgs);
@@ -168,32 +308,52 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
        srcImgs, imgToDenoiseIndex,
        temporalWindowSize, templateWindowSize, searchWindowSize);
    int hn = (int)h.size();
    int type = srcImgs[0].type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    CV_Assert(hn == 1 || hn == cn);
    _dst.create(srcImgs[0].size(), srcImgs[0].type());
    Mat dst = _dst.getMat();
-    switch (srcImgs[0].type())
+    switch (normType) {
-    {
+        case NORM_L2:
-        case CV_8U:
+            switch (depth) {
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                case CV_8U:
-                FastNlMeansMultiDenoisingInvoker<uchar>(
+                    fastNlMeansDenoisingMulti_<uchar, int, unsigned,
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                               DistSquared>(srcImgs, dst,
-                    dst, templateWindowSize, searchWindowSize, h));
+                                                            imgToDenoiseIndex, temporalWindowSize,
                                                            h,
                                                            templateWindowSize, searchWindowSize);
                    break;
                default:
                    CV_Error(Error::StsBadArg,
                             "Unsupported depth! Only CV_8U is supported for NORM_L2");
            }
            break;
-        case CV_8UC2:
+        case NORM_L1:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
+            switch (depth) {
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
+                case CV_8U:
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    fastNlMeansDenoisingMulti_<uchar, int, unsigned,
-                    dst, templateWindowSize, searchWindowSize, h));
+                                               DistAbs>(srcImgs, dst,
-            break;
+                                                        imgToDenoiseIndex, temporalWindowSize,
-        case CV_8UC3:
+                                                        h,
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                                                        templateWindowSize, searchWindowSize);
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
+                    break;
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                case CV_16U:
-                    dst, templateWindowSize, searchWindowSize, h));
+                    fastNlMeansDenoisingMulti_<ushort, int64, uint64,
                                               DistAbs>(srcImgs, dst,
                                                        imgToDenoiseIndex, temporalWindowSize,
                                                        h,
                                                        templateWindowSize, searchWindowSize);
                    break;
                default:
                    CV_Error(Error::StsBadArg,
                             "Unsupported depth! Only CV_8U and CV_16U are supported for NORM_L1");
            }
            break;
        default:
            CV_Error(Error::StsBadArg,
-                "Unsupported matrix format! Only uchar, Vec2b, Vec3b are supported");
+                     "Unsupported norm type! Only NORM_L2 and NORM_L1 are supported");
    }
 }
@@ -212,9 +372,10 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
    _dst.create(srcImgs[0].size(), srcImgs[0].type());
    Mat dst = _dst.getMat();
    int type = srcImgs[0].type(), depth = CV_MAT_DEPTH(type);
    int src_imgs_size = static_cast<int>(srcImgs.size());
-    if (srcImgs[0].type() != CV_8UC3)
+    if (type != CV_8UC3)
    {
        CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3!");
        return;
@@ -228,9 +389,9 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
    std::vector<Mat> ab(src_imgs_size);
    for (int i = 0; i < src_imgs_size; i++)
    {
-        src_lab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC3);
+        src_lab[i] = Mat::zeros(srcImgs[0].size(), type);
-        l[i] = Mat::zeros(srcImgs[0].size(), CV_8UC1);
+        l[i] = Mat::zeros(srcImgs[0].size(), CV_MAKE_TYPE(depth, 1));
-        ab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC2);
+        ab[i] = Mat::zeros(srcImgs[0].size(), CV_MAKE_TYPE(depth, 2));
        cvtColor(srcImgs[i], src_lab[i], COLOR_LBGR2Lab);
        Mat l_ab[] = { l[i], ab[i] };
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -50,13 +50,13 @@
 using namespace cv;
-template <typename T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansDenoisingInvoker :
        public ParallelLoopBody
 {
 public:
    FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
-        int template_window_size, int search_window_size, const float h);
+        int template_window_size, int search_window_size, const float *h);
    void operator() (const Range& range) const;
@@ -75,9 +75,9 @@ private:
    int template_window_half_size_;
    int search_window_half_size_;
-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
    int almost_template_window_size_sq_bin_shift_;
-    std::vector<int> almost_dist2weight_;
+    std::vector<WT> almost_dist2weight_;
    void calcDistSumsForFirstElementInRow(
        int i, Array2d<int>& dist_sums,
@@ -99,15 +99,15 @@ inline int getNearestPowerOf2(int value)
    return p;
 }
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
+FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansDenoisingInvoker(
    const Mat& src, Mat& dst,
    int template_window_size,
    int search_window_size,
-    const float h) :
+    const float *h) :
    src_(src), dst_(dst)
 {
-    CV_Assert(src.channels() == sizeof(T)); //T is Vec1b or Vec2b or Vec3b
+    CV_Assert(src.channels() == pixelInfo<T>::channels);
    template_window_half_size_ = template_window_size / 2;
    search_window_half_size_   = search_window_size   / 2;
@@ -117,8 +117,10 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
    border_size_ = search_window_half_size_ + template_window_half_size_;
    copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT);
-    const int max_estimate_sum_value = search_window_size_ * search_window_size_ * 255;
+    const IT max_estimate_sum_value =
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+        (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
                                          pixelInfo<WT>::sampleMax());
    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -127,30 +129,24 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
    double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
-    int max_dist = 255 * 255 * sizeof(T);
+    int max_dist = D::template maxDist<T>();
    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
    almost_dist2weight_.resize(almost_max_dist);
    const double WEIGHT_THRESHOLD = 0.001;
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        almost_dist2weight_[almost_dist] =
-
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
            weight = 0;
        almost_dist2weight_[almost_dist] = weight;
    }
    CV_Assert(almost_dist2weight_[0] == fixed_point_mult_);
    // additional optimization init end
    if (dst_.empty())
        dst_ = Mat::zeros(src_.size(), src_.type());
 }
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
+void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
    int row_from = range.start;
    int row_to = range.end - 1;
@@ -215,7 +211,7 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                            dist_sums_row[x] -= col_dist_sums_row[x];
                            int bx = start_bx + x;
-                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
+                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
                            dist_sums_row[x] += col_dist_sums_row[x];
                            up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -227,9 +223,11 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
            }
            // calc weights
-            int estimation[3], weights_sum = 0;
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                estimation[channel_num] = 0;
            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
                weights_sum[channel_num] = 0;
            for (int y = 0; y < search_window_size_; y++)
            {
@@ -238,24 +236,21 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                for (int x = 0; x < search_window_size_; x++)
                {
                    int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
-                    int weight = almost_dist2weight_[almostAvgDist];
+                    WT weight = almost_dist2weight_[almostAvgDist];
                    weights_sum += weight;
                    T p = cur_row_ptr[border_size_ + search_window_x + x];
-                    incWithWeight(estimation, weight, p);
+                    incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                }
            }
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum/2) / weights_sum;
+                                                                                      weights_sum);
-
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
        }
    }
 }
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
    int i,
    Array2d<int>& dist_sums,
    Array3d<int>& col_dist_sums,
@@ -276,7 +271,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
            for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                {
-                    int dist = calcDist<T>(extended_src_,
+                    int dist = D::template calcDist<T>(extended_src_,
                        border_size_ + i + ty, border_size_ + j + tx,
                        border_size_ + start_y + ty, border_size_ + start_x + tx);
@@ -288,8 +283,8 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
        }
 }
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
    int i, int j, int first_col_num,
    Array2d<int>& dist_sums,
    Array3d<int>& col_dist_sums,
@@ -312,7 +307,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
            int by = start_by + y;
            int bx = start_bx + x;
            for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
-                col_dist_sums[new_last_col_num][y][x] += calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
+                col_dist_sums[new_last_col_num][y][x] += D::template calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
            dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x];
            up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x];
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -44,118 +44,438 @@
 using namespace cv;
-template <typename T> static inline int calcDist(const T a, const T b);
+template <typename T> struct pixelInfo_
 template <> inline int calcDist(const uchar a, const uchar b)
 {
-    return (a-b) * (a-b);
+    static const int channels = 1;
    typedef T sampleType;
 };
 // Specialization for Vec pixels: the element type becomes sampleType and the
 // element count becomes the channel count.
 template <typename ET, int n> struct pixelInfo_<Vec<ET, n> >
 {
    typedef ET sampleType;          // type of one Vec element
    static const int channels = n;  // number of Vec elements
 };
 // Extends pixelInfo_<T> with helpers describing the range and size of a
 // single sample of the pixel type T.
 template <typename T> struct pixelInfo: public pixelInfo_<T>
 {
    using typename pixelInfo_<T>::sampleType;
    // std::numeric_limits<sampleType>::max()
    static inline sampleType sampleMax()
    {
        typedef std::numeric_limits<sampleType> limits;
        return limits::max();
    }
    // std::numeric_limits<sampleType>::min()
    static inline sampleType sampleMin()
    {
        typedef std::numeric_limits<sampleType> limits;
        return limits::min();
    }
    // Size of one sample in bytes.
    static inline size_t sampleBytes()
    {
        const size_t bytes = sizeof(sampleType);
        return bytes;
    }
    // Size of one sample in bits, computed as 8 bits per byte.
    static inline size_t sampleBits()
    {
        const size_t bytes = sampleBytes();
        return 8*bytes;
    }
 };
 class DistAbs
 {
    template <typename T> struct calcDist_
    {
        static inline int f(const T a, const T b)
        {
            return std::abs((int)(a-b));
        }
    };
    template <typename ET> struct calcDist_<Vec<ET, 2> >
    {
        static inline int f(const Vec<ET, 2> a, const Vec<ET, 2> b)
        {
            return std::abs((int)(a[0]-b[0])) + std::abs((int)(a[1]-b[1]));
        }
    };
    template <typename ET> struct calcDist_<Vec<ET, 3> >
    {
        static inline int f(const Vec<ET, 3> a, const Vec<ET, 3> b)
        {
            return
                std::abs((int)(a[0]-b[0])) +
                std::abs((int)(a[1]-b[1])) +
                std::abs((int)(a[2]-b[2]));
        }
    };
    template <typename ET> struct calcDist_<Vec<ET, 4> >
    {
        static inline int f(const Vec<ET, 4> a, const Vec<ET, 4> b)
        {
            return
                std::abs((int)(a[0]-b[0])) +
                std::abs((int)(a[1]-b[1])) +
                std::abs((int)(a[2]-b[2])) +
                std::abs((int)(a[3]-b[3]));
        }
    };
    template <typename T, typename WT> struct calcWeight_
    {
        static inline WT f(double dist, const float *h, WT fixed_point_mult)
        {
            double w = std::exp(-dist*dist / (h[0]*h[0] * pixelInfo<T>::channels));
            if (std::isnan(w)) w = 1.0; // Handle h = 0.0
            static const double WEIGHT_THRESHOLD = 0.001;
            WT weight = (WT)round(fixed_point_mult * w);
            if (weight < WEIGHT_THRESHOLD * fixed_point_mult) weight = 0;
            return weight;
        }
    };
    template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
    {
        static inline Vec<ET, n> f(double dist, const float *h, ET fixed_point_mult)
        {
            Vec<ET, n> res;
            for (int i=0; i<n; i++)
                res[i] = calcWeight<T, ET>(dist, &h[i], fixed_point_mult);
            return res;
        }
    };
 public:
    template <typename T> static inline int calcDist(const T a, const T b)
    {
        return calcDist_<T>::f(a, b);
    }
    template <typename T>
    static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
    {
        const T a = m.at<T>(i1, j1);
        const T b = m.at<T>(i2, j2);
        return calcDist<T>(a,b);
    }
    template <typename T>
    static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
    {
        return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);
    };
    template <typename T, typename WT>
    static inline WT calcWeight(double dist, const float *h,
                                typename pixelInfo<WT>::sampleType fixed_point_mult)
    {
        return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);
    }
    template <typename T>
    static inline int maxDist()
    {
        return (int)pixelInfo<T>::sampleMax() * pixelInfo<T>::channels;
    }
 };
 class DistSquared
 {
    template <typename T> struct calcDist_
    {
        static inline int f(const T a, const T b)
        {
            return (int)(a-b) * (int)(a-b);
        }
    };
    template <typename ET> struct calcDist_<Vec<ET, 2> >
    {
        static inline int f(const Vec<ET, 2> a, const Vec<ET, 2> b)
        {
            return (int)(a[0]-b[0])*(int)(a[0]-b[0]) + (int)(a[1]-b[1])*(int)(a[1]-b[1]);
        }
    };
    template <typename ET> struct calcDist_<Vec<ET, 3> >
    {
        static inline int f(const Vec<ET, 3> a, const Vec<ET, 3> b)
        {
            return
                (int)(a[0]-b[0])*(int)(a[0]-b[0]) +
                (int)(a[1]-b[1])*(int)(a[1]-b[1]) +
                (int)(a[2]-b[2])*(int)(a[2]-b[2]);
        }
    };
    template <typename ET> struct calcDist_<Vec<ET, 4> >
    {
        static inline int f(const Vec<ET, 4> a, const Vec<ET, 4> b)
        {
            return
                (int)(a[0]-b[0])*(int)(a[0]-b[0]) +
                (int)(a[1]-b[1])*(int)(a[1]-b[1]) +
                (int)(a[2]-b[2])*(int)(a[2]-b[2]) +
                (int)(a[3]-b[3])*(int)(a[3]-b[3]);
        }
    };
    template <typename T> struct calcUpDownDist_
    {
        static inline int f(T a_up, T a_down, T b_up, T b_down)
        {
            int A = a_down - b_down;
            int B = a_up - b_up;
            return (A-B)*(A+B);
        }
    };
    template <typename ET, int n> struct calcUpDownDist_<Vec<ET, n> >
    {
    private:
        typedef Vec<ET, n> T;
    public:
        static inline int f(T a_up, T a_down, T b_up, T b_down)
        {
            return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);
        }
    };
    template <typename T, typename WT> struct calcWeight_
    {
        static inline WT f(double dist, const float *h, WT fixed_point_mult)
        {
            double w = std::exp(-dist / (h[0]*h[0] * pixelInfo<T>::channels));
            if (std::isnan(w)) w = 1.0; // Handle h = 0.0
            static const double WEIGHT_THRESHOLD = 0.001;
            WT weight = (WT)round(fixed_point_mult * w);
            if (weight < WEIGHT_THRESHOLD * fixed_point_mult) weight = 0;
            return weight;
        }
    };
    template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
    {
        static inline Vec<ET, n> f(double dist, const float *h, ET fixed_point_mult)
        {
            Vec<ET, n> res;
            for (int i=0; i<n; i++)
                res[i] = calcWeight<T, ET>(dist, &h[i], fixed_point_mult);
            return res;
        }
    };
 public:
    template <typename T> static inline int calcDist(const T a, const T b)
    {
        return calcDist_<T>::f(a, b);
    }
    template <typename T>
    static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
    {
        const T a = m.at<T>(i1, j1);
        const T b = m.at<T>(i2, j2);
        return calcDist<T>(a,b);
    }
    template <typename T>
    static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
    {
        return calcUpDownDist_<T>::f(a_up, a_down, b_up, b_down);
    };
    template <typename T, typename WT>
    static inline WT calcWeight(double dist, const float *h,
                                typename pixelInfo<WT>::sampleType fixed_point_mult)
    {
        return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);
    }
    template <typename T>
    static inline int maxDist()
    {
        return (int)pixelInfo<T>::sampleMax() * (int)pixelInfo<T>::sampleMax() *
            pixelInfo<T>::channels;
    }
 };
 // Single-channel accumulation: adds weight * p to estimation[0] and weight
 // to weights_sum[0], with the weight converted to IT first.
 template <typename T, typename IT, typename WT> struct incWithWeight_
 {
    static inline void f(IT* estimation, IT* weights_sum, WT weight, T p)
    {
        const IT w = (IT)weight;
        *estimation += w * p;
        *weights_sum += w;
    }
 };
 // Two-channel pixel with one shared weight: every channel is scaled by the
 // same weight, which is added once to weights_sum[0].
 template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 2>, IT, WT>
 {
    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 2> p)
    {
        const IT w = (IT)weight;
        for (int c = 0; c < 2; c++)
            estimation[c] += w * p[c];
        weights_sum[0] += w;
    }
 };
 // Three-channel pixel with one shared weight: every channel is scaled by the
 // same weight, which is added once to weights_sum[0].
 template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 3>, IT, WT>
 {
    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 3> p)
    {
        const IT w = (IT)weight;
        for (int c = 0; c < 3; c++)
            estimation[c] += w * p[c];
        weights_sum[0] += w;
    }
 };
 // Four-channel pixel with one shared weight: every channel is scaled by the
 // same weight, which is added once to weights_sum[0].
 template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 4>, IT, WT>
 {
    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 4> p)
    {
        const IT w = (IT)weight;
        for (int c = 0; c < 4; c++)
            estimation[c] += w * p[c];
        weights_sum[0] += w;
    }
 };
 // Two-channel pixel with per-channel weights: channel c is scaled by
 // weight[c], and weight[c] is added to weights_sum[c].
 template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 2>, IT, Vec<EW, 2> >
 {
    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 2> weight, Vec<ET, 2> p)
    {
        // Estimates first, then weight sums (same order as the unrolled form).
        for (int c = 0; c < 2; c++)
            estimation[c] += (IT)weight[c] * p[c];
        for (int c = 0; c < 2; c++)
            weights_sum[c] += (IT)weight[c];
    }
 };
 // Three-channel pixel with per-channel weights: channel c is scaled by
 // weight[c], and weight[c] is added to weights_sum[c].
 template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 3>, IT, Vec<EW, 3> >
 {
    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 3> weight, Vec<ET, 3> p)
    {
        // Estimates first, then weight sums (same order as the unrolled form).
        for (int c = 0; c < 3; c++)
            estimation[c] += (IT)weight[c] * p[c];
        for (int c = 0; c < 3; c++)
            weights_sum[c] += (IT)weight[c];
    }
 };
 // Four-channel pixel with per-channel weights: channel c is scaled by
 // weight[c], and weight[c] is added to weights_sum[c].
 template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 4>, IT, Vec<EW, 4> >
 {
    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 4> weight, Vec<ET, 4> p)
    {
        // Estimates first, then weight sums (same order as the unrolled form).
        for (int c = 0; c < 4; c++)
            estimation[c] += (IT)weight[c] * p[c];
        for (int c = 0; c < 4; c++)
            weights_sum[c] += (IT)weight[c];
    }
 };
 template <typename T, typename IT, typename WT>
 static inline void incWithWeight(IT* estimation, IT* weights_sum, WT weight, T p)
 {
    return incWithWeight_<T, IT, WT>::f(estimation, weights_sum, weight, p);
 }
-template <> inline int calcDist(const Vec2b a, const Vec2b b)
+template <typename IT, typename UIT, int nc, int nw> struct divByWeightsSum_
 {
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]);
+    static inline void f(IT* estimation, IT* weights_sum);
 };
 // One channel, one weight sum: divides estimation[0] by weights_sum[0],
 // adding half of the divisor first so the quotient is rounded.
 template <typename IT, typename UIT> struct divByWeightsSum_<IT, UIT, 1, 1>
 {
    static inline void f(IT* estimation, IT* weights_sum)
    {
        const IT total = weights_sum[0];
        estimation[0] = (static_cast<UIT>(estimation[0]) + total/2) / total;
    }
 };
 // n channels sharing one weight sum: every estimation[i] is divided by
 // weights_sum[0], adding half of the divisor first so the quotient is rounded.
 template <typename IT, typename UIT, int n> struct divByWeightsSum_<IT, UIT, n, 1>
 {
    static inline void f(IT* estimation, IT* weights_sum)
    {
        const IT& total = weights_sum[0];
        const size_t count = static_cast<size_t>(n);
        for (size_t ch = 0; ch < count; ++ch)
            estimation[ch] = (static_cast<UIT>(estimation[ch]) + total/2) / total;
    }
 };
 // n channels with n weight sums: estimation[i] is divided by weights_sum[i],
 // adding half of the divisor first so the quotient is rounded.
 template <typename IT, typename UIT, int n> struct divByWeightsSum_<IT, UIT, n, n>
 {
    static inline void f(IT* estimation, IT* weights_sum)
    {
        const size_t count = static_cast<size_t>(n);
        for (size_t ch = 0; ch < count; ++ch)
        {
            const IT& total = weights_sum[ch];
            estimation[ch] = (static_cast<UIT>(estimation[ch]) + total/2) / total;
        }
    }
 };
 template <typename IT, typename UIT, int nc, int nw>
 static inline void divByWeightsSum(IT* estimation, IT* weights_sum)
 {
    return divByWeightsSum_<IT, UIT, nc, nw>::f(estimation, weights_sum);
 }
-template <> inline int calcDist(const Vec3b a, const Vec3b b)
+template <typename T, typename IT> struct saturateCastFromArray_
 {
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]) + (a[2]-b[2])*(a[2]-b[2]);
+    static inline T f(IT* estimation)
-}
+    {
        return saturate_cast<T>(estimation[0]);
    }
 };
-template <typename T> static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 2>, IT>
 {
-    const T a = m.at<T>(i1, j1);
+    static inline Vec<ET, 2> f(IT* estimation)
-    const T b = m.at<T>(i2, j2);
+    {
-    return calcDist<T>(a,b);
+        Vec<ET, 2> res;
-}
+        res[0] = saturate_cast<ET>(estimation[0]);
        res[1] = saturate_cast<ET>(estimation[1]);
        return res;
    }
 };
-template <typename T> static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 3>, IT>
 {
-    return calcDist(a_down, b_down) - calcDist(a_up, b_up);
+    static inline Vec<ET, 3> f(IT* estimation)
-}
+    {
        Vec<ET, 3> res;
        res[0] = saturate_cast<ET>(estimation[0]);
        res[1] = saturate_cast<ET>(estimation[1]);
        res[2] = saturate_cast<ET>(estimation[2]);
        return res;
    }
 };
-template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar  b_up, uchar b_down)
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 4>, IT>
 {
-    int A = a_down - b_down;
+    static inline Vec<ET, 4> f(IT* estimation)
-    int B = a_up - b_up;
+    {
-    return (A-B)*(A+B);
+        Vec<ET, 4> res;
-}
+        res[0] = saturate_cast<ET>(estimation[0]);
        res[1] = saturate_cast<ET>(estimation[1]);
        res[2] = saturate_cast<ET>(estimation[2]);
        res[3] = saturate_cast<ET>(estimation[3]);
        return res;
    }
 };
-template <typename T> static inline void incWithWeight(int* estimation, int weight, T p);
+template <typename T, typename IT> static inline T saturateCastFromArray(IT* estimation)
 template <> inline void incWithWeight(int* estimation, int weight, uchar p)
 {
-    estimation[0] += weight * p;
+    return saturateCastFromArray_<T, IT>::f(estimation);
 }
 // Vec2b pixel: adds weight * p[c] to estimation[c] for both channels.
 template <> inline void incWithWeight(int* estimation, int weight, Vec2b p)
 {
    for (int c = 0; c < 2; c++)
        estimation[c] += weight * p[c];
 }
 // Vec3b pixel: adds weight * p[c] to estimation[c] for all three channels.
 template <> inline void incWithWeight(int* estimation, int weight, Vec3b p)
 {
    for (int c = 0; c < 3; c++)
        estimation[c] += weight * p[c];
 }
 // int pixel: adds weight * p to the single accumulator.
 template <> inline void incWithWeight(int* estimation, int weight, int p)
 {
    *estimation += weight * p;
 }
 // Vec2i pixel: adds weight * p[c] to estimation[c] for both channels.
 template <> inline void incWithWeight(int* estimation, int weight, Vec2i p)
 {
    for (int c = 0; c < 2; c++)
        estimation[c] += weight * p[c];
 }
 // Vec3i pixel: adds weight * p[c] to estimation[c] for all three channels.
 template <> inline void incWithWeight(int* estimation, int weight, Vec3i p)
 {
    for (int c = 0; c < 3; c++)
        estimation[c] += weight * p[c];
 }
 template <typename T> static inline T saturateCastFromArray(int* estimation);
 // uchar result: saturating conversion of the first accumulator.
 template <> inline uchar saturateCastFromArray(int* estimation)
 {
    const int value = estimation[0];
    return saturate_cast<uchar>(value);
 }
 // Vec2b result: saturating conversion of the first two accumulators.
 template <> inline Vec2b saturateCastFromArray(int* estimation)
 {
    Vec2b pixel;
    for (int c = 0; c < 2; c++)
        pixel[c] = saturate_cast<uchar>(estimation[c]);
    return pixel;
 }
 // Vec3b result: saturating conversion of the first three accumulators.
 template <> inline Vec3b saturateCastFromArray(int* estimation)
 {
    Vec3b pixel;
    for (int c = 0; c < 3; c++)
        pixel[c] = saturate_cast<uchar>(estimation[c]);
    return pixel;
 }
 // int result: the first accumulator is returned unchanged.
 template <> inline int saturateCastFromArray(int* estimation)
 {
    return *estimation;
 }
 // Vec2i result: writes 0 into estimation[1] (the caller's array is modified)
 // and then builds the Vec2i from the array.
 template <> inline Vec2i saturateCastFromArray(int* estimation)
 {
    estimation[1] = 0;
    Vec2i pixel(estimation);
    return pixel;
 }
 // Vec3i result: built directly from the accumulator array.
 template <> inline Vec3i saturateCastFromArray(int* estimation)
 {
    Vec3i pixel(estimation);
    return pixel;
 }
 #endif
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -28,12 +28,16 @@ static int divUp(int a, int b)
    return (a + b - 1) / b;
 }
-template <typename FT>
+template <typename FT, typename ST, typename WT>
-static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
                                      int searchWindowSize, int templateWindowSize,
                                      const FT *h, int hn, int cn, int normType,
                                      int & almostTemplateWindowSizeSqBinShift)
 {
-    const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255;
+    const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
-    int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue;
+        std::numeric_limits<ST>::max();
    int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
                                           std::numeric_limits<int>::max());
    int depth = DataType<FT>::depth;
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
@@ -48,33 +52,44 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
    FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
    const FT WEIGHT_THRESHOLD = 1e-3f;
-    int maxDist = 255 * 255 * cn;
+    int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
        std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
    int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
-    FT den = 1.0f / (h * h * cn);
+    FT den[4];
    CV_Assert(hn > 0 && hn <= 4);
    for (int i=0; i<hn; i++)
        den[i] = 1.0f / (h[i] * h[i] * cn);
-    almostDist2Weight.create(1, almostMaxDist, CV_32SC1);
+    almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
    char buf[40];
    ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
-                  format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth),
+                  format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
                         ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
                         ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         normType == NORM_L1 ? " -D ABS" : ""));
    if (k.empty())
        return false;
    k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
-           almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD);
+           almostDist2ActualDistMultiplier, fixedPointMult,
           ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
    size_t globalsize[1] = { almostMaxDist };
    return k.run(1, globalsize, NULL, false);
 }
-static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
+static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
-                                     int templateWindowSize, int searchWindowSize)
+                                     int templateWindowSize, int searchWindowSize, int normType)
 {
-    int type = _src.type(), cn = CV_MAT_CN(type);
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
    Size size = _src.size();
-    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
+    if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
                             (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
        return false;
    int templateWindowHalfWize = templateWindowSize / 2;
@@ -84,33 +99,68 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
    int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
    int almostTemplateWindowSizeSqBinShift = -1;
-    char cvt[2][40];
+    char buf[4][40];
    String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
-                         " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
+                         " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
                         " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
                         " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                         " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
-                         " -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s",
+                         " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
-                         templateWindowSize, searchWindowSize, ocl::typeToStr(type),
+                         templateWindowSize, searchWindowSize,
-                         ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize,
+                         ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
-                         templateWindowHalfWize, searchWindowHalfSize,
+                         ocl::typeToStr(CV_32SC(hn)),
-                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
-                         ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
+                         format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
                         depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
                         format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
                         depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
                         format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
                         depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
                         format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
                         BLOCK_COLS, BLOCK_ROWS,
                         ctaSize, templateWindowHalfWize, searchWindowHalfSize,
                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
                         (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
                         ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
                         normType == NORM_L1 ? " -D ABS" : "");
    ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
    if (k.empty())
        return false;
    UMat almostDist2Weight;
-    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn,
+    if ((depth == CV_8U &&
-                                   almostTemplateWindowSizeSqBinShift))
+         !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
                                                       searchWindowSize, templateWindowSize,
                                                       h, hn, cn, normType,
                                                       almostTemplateWindowSizeSqBinShift)) ||
        (depth == CV_16U &&
         !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
                                                          searchWindowSize, templateWindowSize,
                                                          h, hn, cn, normType,
                                                          almostTemplateWindowSizeSqBinShift)))
        return false;
    CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
    UMat srcex;
    int borderSize = searchWindowHalfSize + templateWindowHalfWize;
-    copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+    if (cn == 3) {
        srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
        UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
        int from_to[] = { 0,0, 1,1, 2,2 };
        mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
        copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
                       BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
    }
    else
        copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
    _dst.create(size, type);
-    UMat dst = _dst.getUMat();
+    UMat dst;
    if (cn == 3)
        dst.create(size, CV_MAKE_TYPE(depth, 4));
    else
        dst = _dst.getUMat();
    int searchWindowSizeSq = searchWindowSize * searchWindowSize;
    Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
@@ -123,7 +173,14 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
           ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
    size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
-    return k.run(2, globalsize, localsize, false);
+    if (!k.run(2, globalsize, localsize, false)) return false;
    if (cn == 3) {
        int from_to[] = { 0,0, 1,1, 2,2 };
        mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
    }
    return true;
 }
 static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -50,14 +50,14 @@
 using namespace cv;
-template <typename T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansMultiDenoisingInvoker :
        ParallelLoopBody
 {
 public:
    FastNlMeansMultiDenoisingInvoker(const std::vector<Mat>& srcImgs, int imgToDenoiseIndex,
                                     int temporalWindowSize, Mat& dst, int template_window_size,
-                                     int search_window_size, const float h);
+                                     int search_window_size, const float *h);
    void operator() (const Range& range) const;
@@ -81,9 +81,9 @@ private:
    int search_window_half_size_;
    int temporal_window_half_size_;
-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
    int almost_template_window_size_sq_bin_shift;
-    std::vector<int> almost_dist2weight;
+    std::vector<WT> almost_dist2weight;
    void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
                                          Array4d<int>& col_dist_sums,
@@ -94,19 +94,19 @@ private:
                                          Array4d<int>& up_col_dist_sums) const;
 };
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
+FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansMultiDenoisingInvoker(
    const std::vector<Mat>& srcImgs,
    int imgToDenoiseIndex,
    int temporalWindowSize,
    cv::Mat& dst,
    int template_window_size,
    int search_window_size,
-    const float h) :
+    const float *h) :
        dst_(dst), extended_srcs_(srcImgs.size())
 {
    CV_Assert(srcImgs.size() > 0);
-    CV_Assert(srcImgs[0].channels() == sizeof(T));
+    CV_Assert(srcImgs[0].channels() == pixelInfo<T>::channels);
    rows_ = srcImgs[0].rows;
    cols_ = srcImgs[0].cols;
@@ -125,8 +125,10 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
            border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT);
    main_extended_src_ = extended_srcs_[temporal_window_half_size_];
-    const int max_estimate_sum_value = temporal_window_size_ * search_window_size_ * search_window_size_ * 255;
+    const IT max_estimate_sum_value =
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+        (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
+    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
+                                          pixelInfo<WT>::sampleMax());
    // precalc weight for every possible l2 dist between blocks
    // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -138,30 +140,24 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
    int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
    double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
-    int max_dist = 255 * 255 * sizeof(T);
+    int max_dist = D::template maxDist<T>();
-    int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
+    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
    almost_dist2weight.resize(almost_max_dist);
-    const double WEIGHT_THRESHOLD = 0.001;
    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
    {
        double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        almost_dist2weight[almost_dist] =
-
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
-        almost_dist2weight[almost_dist] = weight;
    }
    CV_Assert(almost_dist2weight[0] == fixed_point_mult_);
    // additional optimization init end
    if (dst_.empty())
        dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type());
 }
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
+void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
    int row_from = range.start;
    int row_to = range.end - 1;
@@ -234,7 +230,7 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                                dist_sums_row[x] -= col_dist_sums_row[x];
                                col_dist_sums_row[x] = up_col_dist_sums_row[x] +
-                                    calcUpDownDist(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
+                                    D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
                                dist_sums_row[x] += col_dist_sums_row[x];
                                up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -247,11 +243,11 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
            }
            // calc weights
-            int weights_sum = 0;
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
-
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-            int estimation[3];
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
                estimation[channel_num] = 0;
+            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
+                weights_sum[channel_num] = 0;
            for (int d = 0; d < temporal_window_size_; d++)
            {
@@ -266,26 +262,22 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                    {
                        int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
-                        int weight = almost_dist2weight[almostAvgDist];
+                        WT weight =  almost_dist2weight[almostAvgDist];
-                        weights_sum += weight;
                        T p = cur_row_ptr[border_size_ + search_window_x + x];
-                        incWithWeight(estimation, weight, p);
+                        incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                    }
                }
            }
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum / 2) / weights_sum;
+                                                                                      weights_sum);
-
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
        }
    }
 }
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
        int i, Array3d<int>& dist_sums, Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
    int j = 0;
@@ -310,7 +302,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
                {
                    for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                    {
-                        int dist = calcDist<T>(
+                        int dist = D::template calcDist<T>(
                                    main_extended_src_.at<T>(border_size_ + i + ty, border_size_ + j + tx),
                                    cur_extended_src.at<T>(border_size_ + start_y + ty, border_size_ + start_x + tx));
@@ -325,8 +317,8 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
    }
 }
-template <class T>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
    int i, int j, int first_col_num, Array3d<int>& dist_sums,
    Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
@@ -353,7 +345,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRo
                int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
                for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                {
-                    *col_dist_sums_ptr += calcDist<T>(
+                    *col_dist_sums_ptr += D::template calcDist<T>(
                                main_extended_src_.at<T>(ay + ty, ax),
                                cur_extended_src.at<T>(by + ty, bx));
                }
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -20,21 +20,23 @@
 #ifdef OP_CALC_WEIGHTS
-__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist,
+__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,
                                    FT almostDist2ActualDistMultiplier, int fixedPointMult,
-                                    FT den, FT WEIGHT_THRESHOLD)
+                                    w_t den, FT WEIGHT_THRESHOLD)
 {
    int almostDist = get_global_id(0);
    if (almostDist < almostMaxDist)
    {
        FT dist = almostDist * almostDist2ActualDistMultiplier;
-        int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
+#ifdef ABS
-
+        w_t w = exp((w_t)(-dist*dist) * den);
-        if (weight < WEIGHT_THRESHOLD * fixedPointMult)
+#else
-            weight = 0;
+        w_t w = exp((w_t)(-dist) * den);
-
+#endif
-        almostDist2Weight[almostDist] = weight;
+        wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));
+        almostDist2Weight[almostDist] =
+            weight < (wlut_t)(WEIGHT_THRESHOLD * fixedPointMult) ? (wlut_t)0 : weight;
    }
 }
@@ -44,21 +46,35 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
 #define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)
-inline int calcDist(uchar_t a, uchar_t b)
+inline int calcDist(pixel_t a, pixel_t b)
 {
+#ifdef ABS
+    int_t retval = convert_int_t(abs_diff(a, b));
+#else
    int_t diff = convert_int_t(a) - convert_int_t(b);
    int_t retval = diff * diff;
+#endif
 #if cn == 1
    return retval;
 #elif cn == 2
    return retval.x + retval.y;
+#elif cn == 3
+    return retval.x + retval.y + retval.z;
+#elif cn == 4
+    return retval.x + retval.y + retval.z + retval.w;
 #else
-#error "cn should be either 1 or 2"
+#error "cn should be either 1, 2, 3 or 4"
 #endif
 }
-inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t)
+#ifdef ABS
+inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
+{
+    return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
+}
+#else
+inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
 {
    int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);
    int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);
@@ -68,10 +84,15 @@ inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_v
    return retval;
 #elif cn == 2
    return retval.x + retval.y;
+#elif cn == 3
+    return retval.x + retval.y + retval.z;
+#elif cn == 4
+    return retval.x + retval.y + retval.z + retval.w;
 #else
-#error "cn should be either 1 or 2"
+#error "cn should be either 1, 2, 3 or 4"
 #endif
 }
+#endif
 #define COND if (x == 0 && y == 0)
@@ -87,9 +108,9 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
    {
        int dist = 0, value;
-        __global const uchar_t * src_template = (__global const uchar_t *)(src +
+        __global const pixel_t * src_template = (__global const pixel_t *)(src +
-            mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
+            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
-        __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
        __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
        #pragma unroll
@@ -107,8 +128,8 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
                dist += value;
            }
-            src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
+            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
-            src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
+            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
        }
        #pragma unroll
@@ -130,9 +151,9 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
    for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
    {
-        __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
-        __global const uchar_t * src_template = (__global const uchar_t *)(src +
+        __global const pixel_t * src_template = (__global const pixel_t *)(src +
-            mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
+            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
        __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
        int col_dist = 0;
@@ -142,8 +163,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
        {
            col_dist += calcDist(src_current[0], src_template[0]);
-            src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
+            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
-            src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
+            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
        }
        dists[i] += col_dist - col_dists_current[first];
@@ -160,8 +181,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
    int sy_up = y - TEMPLATE_SIZE2 - 1;
    int sy_down = y + TEMPLATE_SIZE2;
-    uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
+    pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));
-    uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
+    pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));
    sx -= SEARCH_SIZE2;
    sy_up -= SEARCH_SIZE2;
@@ -171,8 +192,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
    {
        int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
-        uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
+        pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));
-        uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
+        pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));
        __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
        __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
@@ -186,24 +207,25 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
 }
 inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
-                           __local int * dists, __global const int * almostDist2Weight,
+                           __local int * dists, __global const wlut_t * almostDist2Weight,
                           __global uchar * dst, int dst_step, int dst_offset,
-                           int y, int x, int id, __local int * weights_local,
+                           int y, int x, int id, __local weight_t * weights_local,
-                           __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
+                           __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
 {
-    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0;
+    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
-    int_t weighted_sum = (int_t)(0);
+    weight_t weights = (weight_t)0;
+    sum_t weighted_sum = (sum_t)0;
    for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
    {
-        int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
+        int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));
-        int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index));
+        sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
        int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
-        int weight = almostDist2Weight[almostAvgDist];
+        weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);
        weights += weight;
-        weighted_sum += (int_t)(weight) * src_value;
+        weighted_sum += (sum_t)weight * src_value;
    }
    weights_local[id] = weights;
@@ -223,26 +245,27 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
    if (id == 0)
    {
-        int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
+        int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));
-        int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
+        sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
            weighted_sum_local[2] + weighted_sum_local[3];
-        int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
+        weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
-        *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / (int_t)(weights_local_0));
+        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);
    }
 }
 __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                                   __global const int * almostDist2Weight, __global uchar * buffer,
+                                   __global const wlut_t * almostDist2Weight, __global uchar * buffer,
                                   int almostTemplateWindowSizeSqBinShift)
 {
    int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
    int block_y = get_group_id(1);
    int id = get_local_id(0), first;
-    __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE];
+    __local int dists[SEARCH_SIZE_SQ];
-    __local int_t weighted_sum[CTA_SIZE];
+    __local weight_t weights[CTA_SIZE];
+    __local sum_t weighted_sum[CTA_SIZE];
    int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
    int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -13,11 +13,11 @@
 namespace cvtest {
 namespace ocl {
-PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
+PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, int, bool, bool)
 {
-    int cn, templateWindowSize, searchWindowSize;
+    int cn, normType, templateWindowSize, searchWindowSize;
-    float h;
+    std::vector<float> h;
-    bool use_roi;
+    bool use_roi, use_image;
    TEST_DECLARE_INPUT_PARAMETER(src);
    TEST_DECLARE_OUTPUT_PARAMETER(dst);
@@ -25,29 +25,46 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
    virtual void SetUp()
    {
        cn = GET_PARAM(0);
-        use_roi = GET_PARAM(1);
+        normType = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+        use_image = GET_PARAM(3);
        templateWindowSize = 7;
        searchWindowSize = 21;
-        h = 3.0f;
+
+        h.resize(cn);
+        for (int i=0; i<cn; i++)
+            h[i] = 3.0f + 0.5f*i;
    }
    virtual void generateTestData()
    {
+        const int type = CV_8UC(cn);
        Mat image;
-        if (cn == 1)
+
-        {
+        if (use_image) {
-            image = readImage("denoising/lena_noised_gaussian_sigma=10.png", IMREAD_GRAYSCALE);
+            image = readImage("denoising/lena_noised_gaussian_sigma=10.png",
+                                  cn == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
            ASSERT_FALSE(image.empty());
        }
-        const int type = CV_8UC(cn);
+        Size roiSize = use_image ? image.size() : randomSize(1, MAX_VALUE);
-        Size roiSize = cn == 1 ? image.size() : randomSize(1, MAX_VALUE);
        Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
-        if (cn == 1)
+        if (use_image) {
-            image.copyTo(src_roi);
+            ASSERT_TRUE(cn > 0 && cn <= 4);
+            if (cn == 2) {
+                int from_to[] = { 0,0, 1,1 };
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 2);
+            }
+            else if (cn == 4) {
+                int from_to[] = { 0,0, 1,1, 2,2, 1,3};
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 4);
+            }
+            else image.copyTo(src_roi);
+        }
        Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
        randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255);
@@ -65,8 +82,23 @@ OCL_TEST_P(FastNlMeansDenoising, Mat)
    {
        generateTestData();
-        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
-        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
        OCL_EXPECT_MATS_NEAR(dst, 1);
    }
 }
+typedef FastNlMeansDenoisingTestBase FastNlMeansDenoising_hsep;
+OCL_TEST_P(FastNlMeansDenoising_hsep, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize, normType));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize, normType));
+        OCL_EXPECT_MATS_NEAR(dst, 1);
+    }
@@ -80,15 +112,21 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
    {
        generateTestData();
-        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h[0], h[0], templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h[0], h[0], templateWindowSize, searchWindowSize));
        OCL_EXPECT_MATS_NEAR(dst, 1);
    }
 }
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored, Combine(Values(3, 4), Bool()));
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1),
+                                    Bool(), Values(true)));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising_hsep,
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1),
+                                    Bool(), Values(true)));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
+                            Combine(Values(3, 4), Values((int)NORM_L2), Bool(), Values(false)));
 } } // namespace cvtest::ocl