From e178294b49ce36938fd275d3309db6110698b492 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 12 Feb 2015 15:23:28 +0100
Subject: [PATCH 01/40] Refactoring in preparation for 16-bit implementation of
 fastNlMeansDenoising

---
 modules/photo/src/denoising.cpp               |  12 +-
 .../src/fast_nlmeans_denoising_invoker.hpp    |  86 +++----
 ...fast_nlmeans_denoising_invoker_commons.hpp | 218 ++++++++++--------
 .../fast_nlmeans_multi_denoising_invoker.hpp  |  93 ++++----
 4 files changed, 222 insertions(+), 187 deletions(-)
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index a074ac136..724ea0eb0 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -65,17 +65,17 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar>(
+                FastNlMeansDenoisingInvoker<uchar, int, unsigned int>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b>(
+                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned int>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b>(
+                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned int>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         default:
@@ -175,19 +175,19 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
     {
         case CV_8U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar>(
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index b8f5a0392..2ad0189ef 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -50,7 +50,7 @@
 
 using namespace cv;
 
-template <typename T>
+template <typename T, typename IT, typename UIT>
 struct FastNlMeansDenoisingInvoker :
         public ParallelLoopBody
 {
@@ -75,20 +75,20 @@ private:
     int template_window_half_size_;
     int search_window_half_size_;
 
-    int fixed_point_mult_;
+    IT fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift_;
-    std::vector<int> almost_dist2weight_;
+    std::vector<IT> almost_dist2weight_;
 
     void calcDistSumsForFirstElementInRow(
-        int i, Array2d<int>& dist_sums,
-        Array3d<int>& col_dist_sums,
-        Array3d<int>& up_col_dist_sums) const;
+        int i, Array2d<IT>& dist_sums,
+        Array3d<IT>& col_dist_sums,
+        Array3d<IT>& up_col_dist_sums) const;
 
     void calcDistSumsForElementInFirstRow(
         int i, int j, int first_col_num,
-        Array2d<int>& dist_sums,
-        Array3d<int>& col_dist_sums,
-        Array3d<int>& up_col_dist_sums) const;
+        Array2d<IT>& dist_sums,
+        Array3d<IT>& col_dist_sums,
+        Array3d<IT>& up_col_dist_sums) const;
 };
 
 inline int getNearestPowerOf2(int value)
@@ -99,8 +99,8 @@ inline int getNearestPowerOf2(int value)
     return p;
 }
 
-template <class T>
-FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
+template <class T, typename IT, typename UIT>
+FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     const Mat& src, Mat& dst,
     int template_window_size,
     int search_window_size,
@@ -117,8 +117,8 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
     border_size_ = search_window_half_size_ + template_window_half_size_;
     copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT);
 
-    const int max_estimate_sum_value = search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value = (IT)search_window_size_ * (IT)search_window_size_ * 255;
+    fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -127,7 +127,7 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
     almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
-    int max_dist = 255 * 255 * sizeof(T);
+    IT max_dist = 255 * 255 * sizeof(T);
     int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight_.resize(almost_max_dist);
 
@@ -135,7 +135,7 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
 
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
@@ -149,21 +149,21 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
         dst_ = Mat::zeros(src_.size(), src_.type());
 }
 
-template <class T>
-void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
+template <class T, typename IT, typename UIT>
+void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
 
     // sums of cols anf rows for current pixel p
-    Array2d<int> dist_sums(search_window_size_, search_window_size_);
+    Array2d<IT> dist_sums(search_window_size_, search_window_size_);
 
     // for lazy calc optimization (sum of cols for current pixel)
-    Array3d<int> col_dist_sums(template_window_size_, search_window_size_, search_window_size_);
+    Array3d<IT> col_dist_sums(template_window_size_, search_window_size_, search_window_size_);
 
     int first_col_num = -1;
     // last elements of column sum (for each element in row)
-    Array3d<int> up_col_dist_sums(src_.cols, search_window_size_, search_window_size_);
+    Array3d<IT> up_col_dist_sums(src_.cols, search_window_size_, search_window_size_);
 
     for (int i = row_from; i <= row_to; i++)
     {
@@ -202,9 +202,9 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
 
                     for (int y = 0; y < search_window_size; y++)
                     {
-                        int * dist_sums_row = dist_sums.row_ptr(y);
-                        int * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y);
-                        int * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y);
+                        IT * dist_sums_row = dist_sums.row_ptr(y);
+                        IT * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y);
+                        IT * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y);
 
                         const T * b_up_ptr = extended_src_.ptr<T>(start_by - template_window_half_size_ - 1 + y);
                         const T * b_down_ptr = extended_src_.ptr<T>(start_by + template_window_half_size_ + y);
@@ -215,7 +215,7 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
                             dist_sums_row[x] -= col_dist_sums_row[x];
 
                             int bx = start_bx + x;
-                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
+                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
 
                             dist_sums_row[x] += col_dist_sums_row[x];
                             up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -227,39 +227,39 @@ void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const
             }
 
             // calc weights
-            int estimation[3], weights_sum = 0;
+            IT estimation[3], weights_sum = 0;
             for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
                 estimation[channel_num] = 0;
 
             for (int y = 0; y < search_window_size_; y++)
             {
                 const T* cur_row_ptr = extended_src_.ptr<T>(border_size_ + search_window_y + y);
-                int* dist_sums_row = dist_sums.row_ptr(y);
+                IT* dist_sums_row = dist_sums.row_ptr(y);
                 for (int x = 0; x < search_window_size_; x++)
                 {
-                    int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
-                    int weight = almost_dist2weight_[almostAvgDist];
+                    int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_);
+                    IT weight = almost_dist2weight_[almostAvgDist];
                     weights_sum += weight;
 
                     T p = cur_row_ptr[border_size_ + search_window_x + x];
-                    incWithWeight(estimation, weight, p);
+                    incWithWeight<T, IT>(estimation, weight, p);
                 }
             }
 
             for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum/2) / weights_sum;
+                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum/2) / weights_sum;
 
-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
         }
     }
 }
 
-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElementInRow(
     int i,
-    Array2d<int>& dist_sums,
-    Array3d<int>& col_dist_sums,
-    Array3d<int>& up_col_dist_sums) const
+    Array2d<IT>& dist_sums,
+    Array3d<IT>& col_dist_sums,
+    Array3d<IT>& up_col_dist_sums) const
 {
     int j = 0;
 
@@ -276,7 +276,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                 {
-                    int dist = calcDist<T>(extended_src_,
+                    int dist = calcDist<T, IT>(extended_src_,
                         border_size_ + i + ty, border_size_ + j + tx,
                         border_size_ + start_y + ty, border_size_ + start_x + tx);
 
@@ -288,12 +288,12 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
         }
 }
 
-template <class T>
-inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num,
-    Array2d<int>& dist_sums,
-    Array3d<int>& col_dist_sums,
-    Array3d<int>& up_col_dist_sums) const
+    Array2d<IT>& dist_sums,
+    Array3d<IT>& col_dist_sums,
+    Array3d<IT>& up_col_dist_sums) const
 {
     int ay = border_size_ + i;
     int ax = border_size_ + j + template_window_half_size_;
@@ -312,7 +312,7 @@ inline void FastNlMeansDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
             int by = start_by + y;
             int bx = start_bx + x;
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
-                col_dist_sums[new_last_col_num][y][x] += calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
+                col_dist_sums[new_last_col_num][y][x] += calcDist<T,IT>(extended_src_, ay + ty, ax, by + ty, bx);
 
             dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x];
             up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x];
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index ab7db5d2d..e4e0a3a59 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -44,118 +44,152 @@
 
 using namespace cv;
 
-template <typename T> static inline int calcDist(const T a, const T b);
-
-template <> inline int calcDist(const uchar a, const uchar b)
+template <typename T, typename IT> struct calcDist_
 {
-    return (a-b) * (a-b);
+    static inline IT f(const T a, const T b);
+};
+
+template <typename IT> struct calcDist_<uchar, IT>
+{
+    static inline IT f(uchar a, uchar b)
+    {
+        return (IT)(a-b) * (IT)(a-b);
+    }
+};
+
+template <typename IT> struct calcDist_<Vec2b, IT>
+{
+    static inline IT f(const Vec2b a, const Vec2b b)
+    {
+        return (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) + (IT)(a[1]-b[1])*(IT)(a[1]-b[1]);
+    }
+};
+
+template <typename IT> struct calcDist_<Vec3b, IT>
+{
+    static inline IT f(const Vec3b a, const Vec3b b)
+    {
+        return
+            (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
+            (IT)(a[1]-b[1])*(IT)(a[1]-b[1]) +
+            (IT)(a[2]-b[2])*(IT)(a[2]-b[2]);
+    }
+};
+
+template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
+{
+    return calcDist_<T, IT>::f(a, b);
 }
 
-template <> inline int calcDist(const Vec2b a, const Vec2b b)
-{
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]);
-}
-
-template <> inline int calcDist(const Vec3b a, const Vec3b b)
-{
-    return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]) + (a[2]-b[2])*(a[2]-b[2]);
-}
-
-template <typename T> static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+template <typename T, typename IT>
+static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
 {
     const T a = m.at<T>(i1, j1);
     const T b = m.at<T>(i2, j2);
-    return calcDist<T>(a,b);
+    return calcDist<T, IT>(a,b);
 }
 
-template <typename T> static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+template <typename T, typename IT> struct calcUpDownDist_
 {
-    return calcDist(a_down, b_down) - calcDist(a_up, b_up);
+    static inline IT f(T a_up, T a_down, T b_up, T b_down)
+    {
+        return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+    }
+};
+
+template <typename IT> struct calcUpDownDist_<uchar, IT>
+{
+    static inline IT f(uchar a_up, uchar a_down, uchar b_up, uchar b_down)
+    {
+        IT A = a_down - b_down;
+        IT B = a_up - b_up;
+        return (A-B)*(A+B);
+    }
+};
+
+template <typename T, typename IT>
+static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+{
+    return calcUpDownDist_<T, IT>::f(a_up, a_down, b_up, b_down);
+};
+
+template <typename T, typename IT> struct incWithWeight_
+{
+    static inline void f(IT* estimation, IT weight, T p);
+};
+
+template <typename IT> struct incWithWeight_<uchar, IT>
+{
+    static inline void f(IT* estimation, IT weight, uchar p)
+    {
+        estimation[0] += weight * p;
+    }
+};
+
+template <typename IT> struct incWithWeight_<Vec2b, IT>
+{
+    static inline void f(IT* estimation, IT weight, Vec2b p)
+    {
+        estimation[0] += weight * p[0];
+        estimation[1] += weight * p[1];
+    }
+};
+
+template <typename IT> struct incWithWeight_<Vec3b, IT>
+{
+    static inline void f(IT* estimation, IT weight, Vec3b p)
+    {
+        estimation[0] += weight * p[0];
+        estimation[1] += weight * p[1];
+        estimation[2] += weight * p[2];
+    }
+};
+
+template <typename T, typename IT>
+static inline void incWithWeight(IT* estimation, IT weight, T p)
+{
+    return incWithWeight_<T, IT>::f(estimation, weight, p);
 }
 
-template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar  b_up, uchar b_down)
+template <typename T, typename IT> struct saturateCastFromArray_
 {
-    int A = a_down - b_down;
-    int B = a_up - b_up;
-    return (A-B)*(A+B);
-}
+    static inline T f(IT* estimation);
+};
 
-template <typename T> static inline void incWithWeight(int* estimation, int weight, T p);
-
-template <> inline void incWithWeight(int* estimation, int weight, uchar p)
+template <typename IT> struct saturateCastFromArray_<uchar, IT>
 {
-    estimation[0] += weight * p;
-}
+    static inline uchar f(IT* estimation)
+    {
+        return saturate_cast<uchar>(estimation[0]);
+    }
+};
 
-template <> inline void incWithWeight(int* estimation, int weight, Vec2b p)
+template <typename IT> struct saturateCastFromArray_<Vec2b, IT>
 {
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-}
+    static inline Vec2b f(IT* estimation)
+    {
+        Vec2b res;
+        res[0] = saturate_cast<uchar>(estimation[0]);
+        res[1] = saturate_cast<uchar>(estimation[1]);
+        return res;
+    }
+};
 
-template <> inline void incWithWeight(int* estimation, int weight, Vec3b p)
+template <typename IT> struct saturateCastFromArray_<Vec3b, IT>
 {
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-    estimation[2] += weight * p[2];
-}
+    static inline Vec3b f(IT* estimation)
+    {
+        Vec3b res;
+        res[0] = saturate_cast<uchar>(estimation[0]);
+        res[1] = saturate_cast<uchar>(estimation[1]);
+        res[2] = saturate_cast<uchar>(estimation[2]);
+        return res;
+    }
+};
 
-template <> inline void incWithWeight(int* estimation, int weight, int p)
+template <typename T, typename IT> static inline T saturateCastFromArray(IT* estimation)
 {
-    estimation[0] += weight * p;
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, Vec2i p)
-{
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-}
-
-template <> inline void incWithWeight(int* estimation, int weight, Vec3i p)
-{
-    estimation[0] += weight * p[0];
-    estimation[1] += weight * p[1];
-    estimation[2] += weight * p[2];
-}
-
-template <typename T> static inline T saturateCastFromArray(int* estimation);
-
-template <> inline uchar saturateCastFromArray(int* estimation)
-{
-    return saturate_cast<uchar>(estimation[0]);
-}
-
-template <> inline Vec2b saturateCastFromArray(int* estimation)
-{
-    Vec2b res;
-    res[0] = saturate_cast<uchar>(estimation[0]);
-    res[1] = saturate_cast<uchar>(estimation[1]);
-    return res;
-}
-
-template <> inline Vec3b saturateCastFromArray(int* estimation)
-{
-    Vec3b res;
-    res[0] = saturate_cast<uchar>(estimation[0]);
-    res[1] = saturate_cast<uchar>(estimation[1]);
-    res[2] = saturate_cast<uchar>(estimation[2]);
-    return res;
-}
-
-template <> inline int saturateCastFromArray(int* estimation)
-{
-    return estimation[0];
-}
-
-template <> inline Vec2i saturateCastFromArray(int* estimation)
-{
-    estimation[1] = 0;
-    return Vec2i(estimation);
-}
-
-template <> inline Vec3i saturateCastFromArray(int* estimation)
-{
-    return Vec3i(estimation);
+    return saturateCastFromArray_<T, IT>::f(estimation);
 }
 
 #endif
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 191a67127..392733c08 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -50,7 +50,7 @@
 
 using namespace cv;
 
-template <typename T>
+template <typename T, typename IT, typename UIT>
 struct FastNlMeansMultiDenoisingInvoker :
         ParallelLoopBody
 {
@@ -81,21 +81,21 @@ private:
     int search_window_half_size_;
     int temporal_window_half_size_;
 
-    int fixed_point_mult_;
+    IT fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift;
-    std::vector<int> almost_dist2weight;
+    std::vector<IT> almost_dist2weight;
 
-    void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
-                                          Array4d<int>& col_dist_sums,
-                                          Array4d<int>& up_col_dist_sums) const;
+    void calcDistSumsForFirstElementInRow(int i, Array3d<IT>& dist_sums,
+                                          Array4d<IT>& col_dist_sums,
+                                          Array4d<IT>& up_col_dist_sums) const;
 
     void calcDistSumsForElementInFirstRow(int i, int j, int first_col_num,
-                                          Array3d<int>& dist_sums, Array4d<int>& col_dist_sums,
-                                          Array4d<int>& up_col_dist_sums) const;
+                                          Array3d<IT>& dist_sums, Array4d<IT>& col_dist_sums,
+                                          Array4d<IT>& up_col_dist_sums) const;
 };
 
-template <class T>
-FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
+template <class T, typename IT, typename UIT>
+FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
     const std::vector<Mat>& srcImgs,
     int imgToDenoiseIndex,
     int temporalWindowSize,
@@ -125,8 +125,9 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
             border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT);
 
     main_extended_src_ = extended_srcs_[temporal_window_half_size_];
-    const int max_estimate_sum_value = temporal_window_size_ * search_window_size_ * search_window_size_ * 255;
-    fixed_point_mult_ = std::numeric_limits<int>::max() / max_estimate_sum_value;
+    const IT max_estimate_sum_value =
+        (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * 255;
+    fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -138,7 +139,7 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
 
-    int max_dist = 255 * 255 * sizeof(T);
+    IT max_dist = 255 * 255 * sizeof(T);
     int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight.resize(almost_max_dist);
 
@@ -146,7 +147,7 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
 
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
@@ -160,19 +161,19 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
         dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type());
 }
 
-template <class T>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
+template <class T, typename IT, typename UIT>
+void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
 
-    Array3d<int> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
+    Array3d<IT> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
 
     // for lazy calc optimization
-    Array4d<int> col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_);
+    Array4d<IT> col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_);
 
     int first_col_num = -1;
-    Array4d<int> up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_);
+    Array4d<IT> up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_);
 
     for (int i = row_from; i <= row_to; i++)
     {
@@ -216,15 +217,15 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                     for (int d = 0; d < temporal_window_size_; d++)
                     {
                         Mat cur_extended_src = extended_srcs_[d];
-                        Array2d<int> cur_dist_sums = dist_sums[d];
-                        Array2d<int> cur_col_dist_sums = col_dist_sums[first_col_num][d];
-                        Array2d<int> cur_up_col_dist_sums = up_col_dist_sums[j][d];
+                        Array2d<IT> cur_dist_sums = dist_sums[d];
+                        Array2d<IT> cur_col_dist_sums = col_dist_sums[first_col_num][d];
+                        Array2d<IT> cur_up_col_dist_sums = up_col_dist_sums[j][d];
                         for (int y = 0; y < search_window_size; y++)
                         {
-                            int* dist_sums_row = cur_dist_sums.row_ptr(y);
+                            IT* dist_sums_row = cur_dist_sums.row_ptr(y);
 
-                            int* col_dist_sums_row = cur_col_dist_sums.row_ptr(y);
-                            int* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y);
+                            IT* col_dist_sums_row = cur_col_dist_sums.row_ptr(y);
+                            IT* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y);
 
                             const T* b_up_ptr = cur_extended_src.ptr<T>(start_by - template_window_half_size_ - 1 + y);
                             const T* b_down_ptr = cur_extended_src.ptr<T>(start_by + template_window_half_size_ + y);
@@ -234,7 +235,7 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                                 dist_sums_row[x] -= col_dist_sums_row[x];
 
                                 col_dist_sums_row[x] = up_col_dist_sums_row[x] +
-                                    calcUpDownDist(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
+                                    calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
 
                                 dist_sums_row[x] += col_dist_sums_row[x];
                                 up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -247,9 +248,9 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
             }
 
             // calc weights
-            int weights_sum = 0;
+            IT weights_sum = 0;
 
-            int estimation[3];
+            IT estimation[3];
             for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
                 estimation[channel_num] = 0;
 
@@ -260,33 +261,33 @@ void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const
                 {
                     const T* cur_row_ptr = esrc_d.ptr<T>(border_size_ + search_window_y + y);
 
-                    int* dist_sums_row = dist_sums.row_ptr(d, y);
+                    IT* dist_sums_row = dist_sums.row_ptr(d, y);
 
                     for (int x = 0; x < search_window_size_; x++)
                     {
-                        int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
+                        int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift);
 
-                        int weight = almost_dist2weight[almostAvgDist];
+                        IT weight = almost_dist2weight[almostAvgDist];
                         weights_sum += weight;
 
                         T p = cur_row_ptr[border_size_ + search_window_x + x];
-                        incWithWeight(estimation, weight, p);
+                        incWithWeight<T, IT>(estimation, weight, p);
                     }
                 }
             }
 
             for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = ((unsigned)estimation[channel_num] + weights_sum / 2) / weights_sum;
+                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum; // ????
 
-            dst_.at<T>(i,j) = saturateCastFromArray<T>(estimation);
+            dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
 
         }
     }
 }
 
-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRow(
-        int i, Array3d<int>& dist_sums, Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElementInRow(
+        int i, Array3d<IT>& dist_sums, Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
 {
     int j = 0;
 
@@ -303,14 +304,14 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
                 int start_y = i + y - search_window_half_size_;
                 int start_x = j + x - search_window_half_size_;
 
-                int* dist_sums_ptr = &dist_sums[d][y][x];
-                int* col_dist_sums_ptr = &col_dist_sums[0][d][y][x];
+                IT* dist_sums_ptr = &dist_sums[d][y][x];
+                IT* col_dist_sums_ptr = &col_dist_sums[0][d][y][x];
                 int col_dist_sums_step = col_dist_sums.step_size(0);
                 for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                 {
                     for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                     {
-                        int dist = calcDist<T>(
+                        IT dist = calcDist<T, IT>(
                                     main_extended_src_.at<T>(border_size_ + i + ty, border_size_ + j + tx),
                                     cur_extended_src.at<T>(border_size_ + start_y + ty, border_size_ + start_x + tx));
 
@@ -325,10 +326,10 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForFirstElementInRo
     }
 }
 
-template <class T>
-inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRow(
-    int i, int j, int first_col_num, Array3d<int>& dist_sums,
-    Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
+template <class T, typename IT, typename UIT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForElementInFirstRow(
+    int i, int j, int first_col_num, Array3d<IT>& dist_sums,
+    Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
 {
     int ay = border_size_ + i;
     int ax = border_size_ + j + template_window_half_size_;
@@ -350,10 +351,10 @@ inline void FastNlMeansMultiDenoisingInvoker<T>::calcDistSumsForElementInFirstRo
                 int by = start_by + y;
                 int bx = start_bx + x;
 
-                int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
+                IT* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
                 for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 {
-                    *col_dist_sums_ptr += calcDist<T>(
+                    *col_dist_sums_ptr += calcDist<T, IT>(
                                 main_extended_src_.at<T>(ay + ty, ax),
                                 cur_extended_src.at<T>(by + ty, bx));
                 }

From 8368fb9ea8dc03a5d09a3d701858ee272a9c818a Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 12 Feb 2015 18:45:09 +0100
Subject: [PATCH 02/40] Additional refactoring preparing for 16-bit
 implementation

---
 .../src/fast_nlmeans_denoising_invoker.hpp    |  12 +-
 ...fast_nlmeans_denoising_invoker_commons.hpp | 113 +++++++++++-------
 .../fast_nlmeans_multi_denoising_invoker.hpp  |   9 +-
 3 files changed, 83 insertions(+), 51 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 2ad0189ef..202e36013 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -107,7 +107,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     const float h) :
     src_(src), dst_(dst)
 {
-    CV_Assert(src.channels() == sizeof(T)); //T is Vec1b or Vec2b or Vec3b
+    CV_Assert(src.channels() == pixelInfo<T>::channels);
 
     template_window_half_size_ = template_window_size / 2;
     search_window_half_size_   = search_window_size   / 2;
@@ -117,17 +117,21 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     border_size_ = search_window_half_size_ + template_window_half_size_;
     copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT);
 
-    const IT max_estimate_sum_value = (IT)search_window_size_ * (IT)search_window_size_ * 255;
+    const IT max_estimate_sum_value =
+        (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
     fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
+    // squared distances are truncated to 16 bits to get a reasonable table size
     CV_Assert(template_window_size_ <= 46340); // sqrt(INT_MAX)
     int template_window_size_sq = template_window_size_ * template_window_size_;
-    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
+    almost_template_window_size_sq_bin_shift_ =
+        getNearestPowerOf2(template_window_size_sq) + 2*pixelInfo<T>::sampleBits() - 16;
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
-    IT max_dist = 255 * 255 * sizeof(T);
+    IT max_dist =
+        (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
     int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight_.resize(almost_max_dist);
 
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index e4e0a3a59..0a8713b91 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -44,30 +44,62 @@
 
 using namespace cv;
 
-template <typename T, typename IT> struct calcDist_
+template <typename T> struct pixelInfo_
 {
-    static inline IT f(const T a, const T b);
+    static const int channels = 1;
+    typedef T sampleType;
 };
 
-template <typename IT> struct calcDist_<uchar, IT>
+template <typename ET, int n> struct pixelInfo_<Vec<ET, n> >
 {
-    static inline IT f(uchar a, uchar b)
+    static const int channels = n;
+    typedef ET sampleType;
+};
+
+template <typename T> struct pixelInfo: public pixelInfo_<T>
+{
+    using typename pixelInfo_<T>::sampleType;
+
+    static inline sampleType sampleMax()
+    {
+        return std::numeric_limits<sampleType>::max();
+    }
+
+    static inline sampleType sampleMin()
+    {
+        return std::numeric_limits<sampleType>::min();
+    }
+
+    static inline size_t sampleBytes()
+    {
+        return sizeof(sampleType);
+    }
+
+    static inline size_t sampleBits()
+    {
+        return 8*sampleBytes();
+    }
+};
+
+template <typename T, typename IT> struct calcDist_
+{
+    static inline IT f(const T a, const T b)
     {
         return (IT)(a-b) * (IT)(a-b);
     }
 };
 
-template <typename IT> struct calcDist_<Vec2b, IT>
+template <typename ET, typename IT> struct calcDist_<Vec<ET, 2>, IT>
 {
-    static inline IT f(const Vec2b a, const Vec2b b)
+    static inline IT f(const Vec<ET, 2> a, const Vec<ET, 2> b)
     {
         return (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) + (IT)(a[1]-b[1])*(IT)(a[1]-b[1]);
     }
 };
 
-template <typename IT> struct calcDist_<Vec3b, IT>
+template <typename ET, typename IT> struct calcDist_<Vec<ET, 3>, IT>
 {
-    static inline IT f(const Vec3b a, const Vec3b b)
+    static inline IT f(const Vec<ET, 3> a, const Vec<ET, 3> b)
     {
         return
             (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
@@ -92,14 +124,6 @@ static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
 template <typename T, typename IT> struct calcUpDownDist_
 {
     static inline IT f(T a_up, T a_down, T b_up, T b_down)
-    {
-        return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
-    }
-};
-
-template <typename IT> struct calcUpDownDist_<uchar, IT>
-{
-    static inline IT f(uchar a_up, uchar a_down, uchar b_up, uchar b_down)
     {
         IT A = a_down - b_down;
         IT B = a_up - b_up;
@@ -107,6 +131,17 @@ template <typename IT> struct calcUpDownDist_<uchar, IT>
     }
 };
 
+template <typename ET, int n, typename IT> struct calcUpDownDist_<Vec<ET, n>, IT>
+{
+private:
+    typedef Vec<ET, n> T;
+public:
+    static inline IT f(T a_up, T a_down, T b_up, T b_down)
+    {
+        return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+    }
+};
+
 template <typename T, typename IT>
 static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
 {
@@ -115,29 +150,24 @@ static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
 
 template <typename T, typename IT> struct incWithWeight_
 {
-    static inline void f(IT* estimation, IT weight, T p);
-};
-
-template <typename IT> struct incWithWeight_<uchar, IT>
-{
-    static inline void f(IT* estimation, IT weight, uchar p)
+    static inline void f(IT* estimation, IT weight, T p)
     {
         estimation[0] += weight * p;
     }
 };
 
-template <typename IT> struct incWithWeight_<Vec2b, IT>
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT>
 {
-    static inline void f(IT* estimation, IT weight, Vec2b p)
+    static inline void f(IT* estimation, IT weight, Vec<ET, 2> p)
     {
         estimation[0] += weight * p[0];
         estimation[1] += weight * p[1];
     }
 };
 
-template <typename IT> struct incWithWeight_<Vec3b, IT>
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT>
 {
-    static inline void f(IT* estimation, IT weight, Vec3b p)
+    static inline void f(IT* estimation, IT weight, Vec<ET, 3> p)
     {
         estimation[0] += weight * p[0];
         estimation[1] += weight * p[1];
@@ -153,36 +183,31 @@ static inline void incWithWeight(IT* estimation, IT weight, T p)
 
 template <typename T, typename IT> struct saturateCastFromArray_
 {
-    static inline T f(IT* estimation);
-};
-
-template <typename IT> struct saturateCastFromArray_<uchar, IT>
-{
-    static inline uchar f(IT* estimation)
+    static inline T f(IT* estimation)
     {
-        return saturate_cast<uchar>(estimation[0]);
+        return saturate_cast<T>(estimation[0]);
     }
 };
 
-template <typename IT> struct saturateCastFromArray_<Vec2b, IT>
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 2>, IT>
 {
-    static inline Vec2b f(IT* estimation)
+    static inline Vec<ET, 2> f(IT* estimation)
     {
-        Vec2b res;
-        res[0] = saturate_cast<uchar>(estimation[0]);
-        res[1] = saturate_cast<uchar>(estimation[1]);
+        Vec<ET, 2> res;
+        res[0] = saturate_cast<ET>(estimation[0]);
+        res[1] = saturate_cast<ET>(estimation[1]);
         return res;
     }
 };
 
-template <typename IT> struct saturateCastFromArray_<Vec3b, IT>
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 3>, IT>
 {
-    static inline Vec3b f(IT* estimation)
+    static inline Vec<ET, 3> f(IT* estimation)
     {
-        Vec3b res;
-        res[0] = saturate_cast<uchar>(estimation[0]);
-        res[1] = saturate_cast<uchar>(estimation[1]);
-        res[2] = saturate_cast<uchar>(estimation[2]);
+        Vec<ET, 3> res;
+        res[0] = saturate_cast<ET>(estimation[0]);
+        res[1] = saturate_cast<ET>(estimation[1]);
+        res[2] = saturate_cast<ET>(estimation[2]);
         return res;
     }
 };
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 392733c08..48276b426 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -106,7 +106,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
         dst_(dst), extended_srcs_(srcImgs.size())
 {
     CV_Assert(srcImgs.size() > 0);
-    CV_Assert(srcImgs[0].channels() == sizeof(T));
+    CV_Assert(srcImgs[0].channels() == pixelInfo<T>::channels);
 
     rows_ = srcImgs[0].rows;
     cols_ = srcImgs[0].cols;
@@ -126,20 +126,23 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
 
     main_extended_src_ = extended_srcs_[temporal_window_half_size_];
     const IT max_estimate_sum_value =
-        (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * 255;
+        (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
     fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
+    // squared distances are truncated to 16 bits to get a reasonable table size
     int template_window_size_sq = template_window_size_ * template_window_size_;
     almost_template_window_size_sq_bin_shift = 0;
     while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq)
         almost_template_window_size_sq_bin_shift++;
+    almost_template_window_size_sq_bin_shift += 2*pixelInfo<T>::sampleBits() - 16;
 
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
 
-    IT max_dist = 255 * 255 * sizeof(T);
+    IT max_dist =
+        (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
     int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight.resize(almost_max_dist);
 

From 49e93747b17cae65915c66b326e37a94ddc53190 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 12 Feb 2015 22:05:05 +0100
Subject: [PATCH 03/40] Added saturate_cast from int64 and uint64

---
 modules/core/include/opencv2/core/base.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index f2acaa3fb..73beb911f 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -442,6 +442,10 @@ template<typename _Tp> static inline _Tp saturate_cast(int v)      { return _Tp(
 template<typename _Tp> static inline _Tp saturate_cast(float v)    { return _Tp(v); }
 /** @overload */
 template<typename _Tp> static inline _Tp saturate_cast(double v)   { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(int64 v)    { return _Tp(v); }
+/** @overload */
+template<typename _Tp> static inline _Tp saturate_cast(uint64 v)   { return _Tp(v); }
 
 //! @cond IGNORED
 
@@ -452,6 +456,8 @@ template<> inline uchar saturate_cast<uchar>(short v)        { return saturate_c
 template<> inline uchar saturate_cast<uchar>(unsigned v)     { return (uchar)std::min(v, (unsigned)UCHAR_MAX); }
 template<> inline uchar saturate_cast<uchar>(float v)        { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
 template<> inline uchar saturate_cast<uchar>(double v)       { int iv = cvRound(v); return saturate_cast<uchar>(iv); }
+template<> inline uchar saturate_cast<uchar>(int64 v)        { return (uchar)((uint64)v <= (uint64)UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
+template<> inline uchar saturate_cast<uchar>(uint64 v)       { return (uchar)std::min(v, (uint64)UCHAR_MAX); }
 
 template<> inline schar saturate_cast<schar>(uchar v)        { return (schar)std::min((int)v, SCHAR_MAX); }
 template<> inline schar saturate_cast<schar>(ushort v)       { return (schar)std::min((unsigned)v, (unsigned)SCHAR_MAX); }
@@ -460,6 +466,8 @@ template<> inline schar saturate_cast<schar>(short v)        { return saturate_c
 template<> inline schar saturate_cast<schar>(unsigned v)     { return (schar)std::min(v, (unsigned)SCHAR_MAX); }
 template<> inline schar saturate_cast<schar>(float v)        { int iv = cvRound(v); return saturate_cast<schar>(iv); }
 template<> inline schar saturate_cast<schar>(double v)       { int iv = cvRound(v); return saturate_cast<schar>(iv); }
+template<> inline schar saturate_cast<schar>(int64 v)        { return (schar)((uint64)((int64)v-SCHAR_MIN) <= (uint64)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN); }
+template<> inline schar saturate_cast<schar>(uint64 v)       { return (schar)std::min(v, (uint64)SCHAR_MAX); }
 
 template<> inline ushort saturate_cast<ushort>(schar v)      { return (ushort)std::max((int)v, 0); }
 template<> inline ushort saturate_cast<ushort>(short v)      { return (ushort)std::max((int)v, 0); }
@@ -467,12 +475,16 @@ template<> inline ushort saturate_cast<ushort>(int v)        { return (ushort)((
 template<> inline ushort saturate_cast<ushort>(unsigned v)   { return (ushort)std::min(v, (unsigned)USHRT_MAX); }
 template<> inline ushort saturate_cast<ushort>(float v)      { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
 template<> inline ushort saturate_cast<ushort>(double v)     { int iv = cvRound(v); return saturate_cast<ushort>(iv); }
+template<> inline ushort saturate_cast<ushort>(int64 v)      { return (ushort)((uint64)v <= (uint64)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
+template<> inline ushort saturate_cast<ushort>(uint64 v)     { return (ushort)std::min(v, (uint64)USHRT_MAX); }
 
 template<> inline short saturate_cast<short>(ushort v)       { return (short)std::min((int)v, SHRT_MAX); }
 template<> inline short saturate_cast<short>(int v)          { return (short)((unsigned)(v - SHRT_MIN) <= (unsigned)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
 template<> inline short saturate_cast<short>(unsigned v)     { return (short)std::min(v, (unsigned)SHRT_MAX); }
 template<> inline short saturate_cast<short>(float v)        { int iv = cvRound(v); return saturate_cast<short>(iv); }
 template<> inline short saturate_cast<short>(double v)       { int iv = cvRound(v); return saturate_cast<short>(iv); }
+template<> inline short saturate_cast<short>(int64 v)        { return (short)((uint64)((int64)v - SHRT_MIN) <= (uint64)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN); }
+template<> inline short saturate_cast<short>(uint64 v)       { return (short)std::min(v, (uint64)SHRT_MAX); }
 
 template<> inline int saturate_cast<int>(float v)            { return cvRound(v); }
 template<> inline int saturate_cast<int>(double v)           { return cvRound(v); }

From 42db9e7153a6d10b429df0bc2108278251c11ebc Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 12 Feb 2015 22:14:01 +0100
Subject: [PATCH 04/40] Basic 16-bit implementation of fastNlMeansDenoising.
 Table-based exponentiation leads to high memory footprint and loss of
 precision in 16-bit mode.

---
 modules/photo/src/denoising.cpp               | 43 ++++++++++++++++---
 .../src/fast_nlmeans_denoising_invoker.hpp    | 14 +++---
 .../fast_nlmeans_multi_denoising_invoker.hpp  | 15 ++++---
 3 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 724ea0eb0..0abeefe5b 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -65,17 +65,32 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<uchar, int, unsigned>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<unsigned short, int64, uint64>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         default:
@@ -181,13 +196,31 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned int>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned int>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 202e36013..27a016ae9 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -123,11 +123,13 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 16 bits to get a reasonable table size
+    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
+    // TODO: uses lots of memory and loses precision wtih 16-bit images ????
+    const size_t TABLE_MAX_BITS = 24;
     CV_Assert(template_window_size_ <= 46340); // sqrt(INT_MAX)
     int template_window_size_sq = template_window_size_ * template_window_size_;
-    almost_template_window_size_sq_bin_shift_ =
-        getNearestPowerOf2(template_window_size_sq) + 2*pixelInfo<T>::sampleBits() - 16;
+    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq) +
+        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
     IT max_dist =
@@ -139,7 +141,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
 
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
@@ -232,7 +234,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
 
             // calc weights
             IT estimation[3], weights_sum = 0;
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
 
             for (int y = 0; y < search_window_size_; y++)
@@ -250,7 +252,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum/2) / weights_sum;
 
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 48276b426..c90249b82 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -131,12 +131,15 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 16 bits to get a reasonable table size
+    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
+    // TODO: uses lots of memory and loses precision wtih 16-bit images ????
+    const size_t TABLE_MAX_BITS = 24;
     int template_window_size_sq = template_window_size_ * template_window_size_;
     almost_template_window_size_sq_bin_shift = 0;
     while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq)
         almost_template_window_size_sq_bin_shift++;
-    almost_template_window_size_sq_bin_shift += 2*pixelInfo<T>::sampleBits() - 16;
+    almost_template_window_size_sq_bin_shift +=
+        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
 
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
@@ -150,7 +153,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
 
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
@@ -254,7 +257,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
             IT weights_sum = 0;
 
             IT estimation[3];
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
 
             for (int d = 0; d < temporal_window_size_; d++)
@@ -279,8 +282,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum; // ????
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
+                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum;
 
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
 

From d588c717da1ad2b77e03b058a281da3c00ba0327 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 13 Feb 2015 00:11:30 +0100
Subject: [PATCH 05/40] Using WEIGHT_THRESHOLD to limit table size. Still
 problematic with 16-bit and big h-values.

---
 .../src/fast_nlmeans_denoising_invoker.hpp    | 30 +++++++++----------
 .../fast_nlmeans_multi_denoising_invoker.hpp  | 29 +++++++++---------
 2 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 27a016ae9..c9689cabd 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -123,31 +123,28 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
-    // TODO: uses lots of memory and loses precision wtih 16-bit images ????
-    const size_t TABLE_MAX_BITS = 24;
     CV_Assert(template_window_size_ <= 46340); // sqrt(INT_MAX)
     int template_window_size_sq = template_window_size_ * template_window_size_;
-    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq) +
-        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
+    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
+    const double WEIGHT_THRESHOLD = 0.001;
+    const size_t ALLOC_CHUNK = 65536;
     IT max_dist =
         (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
-    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
-    almost_dist2weight_.resize(almost_max_dist);
-
-    const double WEIGHT_THRESHOLD = 0.001;
-    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
+    int almost_max_dist = 0;
+    while (true)
     {
-        double dist = almost_dist * almost_dist2actual_dist_multiplier;
+        double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
         IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
+        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_ || dist > max_dist) break;
 
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
+        if (almost_max_dist >= almost_dist2weight_.size())
+            almost_dist2weight_.resize(almost_max_dist + ALLOC_CHUNK);
 
-        almost_dist2weight_[almost_dist] = weight;
+        almost_dist2weight_[almost_max_dist++] = weight;
     }
+    almost_dist2weight_.resize(almost_max_dist);
     CV_Assert(almost_dist2weight_[0] == fixed_point_mult_);
 
     // additional optimization init end
@@ -161,6 +158,8 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
     int row_from = range.start;
     int row_to = range.end - 1;
 
+    int almost_max_dist = almost_dist2weight_.size();
+
     // sums of cols anf rows for current pixel p
     Array2d<IT> dist_sums(search_window_size_, search_window_size_);
 
@@ -244,7 +243,8 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
                 for (int x = 0; x < search_window_size_; x++)
                 {
                     int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_);
-                    IT weight = almost_dist2weight_[almostAvgDist];
+                    IT weight =
+                        almostAvgDist < almost_max_dist ? almost_dist2weight_[almostAvgDist] : 0;
                     weights_sum += weight;
 
                     T p = cur_row_ptr[border_size_ + search_window_x + x];
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index c90249b82..b4bfc0c6c 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -131,35 +131,31 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
-    // TODO: uses lots of memory and loses precision wtih 16-bit images ????
-    const size_t TABLE_MAX_BITS = 24;
     int template_window_size_sq = template_window_size_ * template_window_size_;
     almost_template_window_size_sq_bin_shift = 0;
     while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq)
         almost_template_window_size_sq_bin_shift++;
-    almost_template_window_size_sq_bin_shift +=
-        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
 
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
 
+    const double WEIGHT_THRESHOLD = 0.001;
+    const size_t ALLOC_CHUNK = 65536;
     IT max_dist =
         (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
-    int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
-    almost_dist2weight.resize(almost_max_dist);
-
-    const double WEIGHT_THRESHOLD = 0.001;
-    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
+    int almost_max_dist = 0;
+    while (true)
     {
-        double dist = almost_dist * almost_dist2actual_dist_multiplier;
+        double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
         IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
+        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_ || dist > max_dist) break;
 
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
+        if (almost_max_dist >= almost_dist2weight.size())
+            almost_dist2weight.resize(almost_max_dist + ALLOC_CHUNK);
 
-        almost_dist2weight[almost_dist] = weight;
+        almost_dist2weight[almost_max_dist++] = weight;
     }
+    almost_dist2weight.resize(almost_max_dist);
     CV_Assert(almost_dist2weight[0] == fixed_point_mult_);
 
     // additional optimization init end
@@ -173,6 +169,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
     int row_from = range.start;
     int row_to = range.end - 1;
 
+    int almost_max_dist = almost_dist2weight.size();
+
     Array3d<IT> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
 
     // for lazy calc optimization
@@ -273,7 +271,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
                     {
                         int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift);
 
-                        IT weight = almost_dist2weight[almostAvgDist];
+                        IT weight =
+                            almostAvgDist < almost_max_dist ? almost_dist2weight[almostAvgDist] : 0;
                         weights_sum += weight;
 
                         T p = cur_row_ptr[border_size_ + search_window_x + x];

From 584372bbf297c386ce71357d70b65068551b9466 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 13 Feb 2015 04:33:29 +0100
Subject: [PATCH 06/40] Fixed bounds checking

---
 modules/photo/src/fast_nlmeans_denoising_invoker.hpp       | 6 +++---
 modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index c9689cabd..2de50a77b 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -132,7 +132,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     const size_t ALLOC_CHUNK = 65536;
     IT max_dist =
         (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
-    int almost_max_dist = 0;
+    size_t almost_max_dist = 0;
     while (true)
     {
         double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
@@ -158,7 +158,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
     int row_from = range.start;
     int row_to = range.end - 1;
 
-    int almost_max_dist = almost_dist2weight_.size();
+    size_t almost_max_dist = almost_dist2weight_.size();
 
     // sums of cols anf rows for current pixel p
     Array2d<IT> dist_sums(search_window_size_, search_window_size_);
@@ -242,7 +242,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
                 IT* dist_sums_row = dist_sums.row_ptr(y);
                 for (int x = 0; x < search_window_size_; x++)
                 {
-                    int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_);
+                    size_t almostAvgDist = (size_t)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_);
                     IT weight =
                         almostAvgDist < almost_max_dist ? almost_dist2weight_[almostAvgDist] : 0;
                     weights_sum += weight;
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index b4bfc0c6c..e0f06c68f 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -143,7 +143,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
     const size_t ALLOC_CHUNK = 65536;
     IT max_dist =
         (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
-    int almost_max_dist = 0;
+    size_t almost_max_dist = 0;
     while (true)
     {
         double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
@@ -169,7 +169,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
     int row_from = range.start;
     int row_to = range.end - 1;
 
-    int almost_max_dist = almost_dist2weight.size();
+    size_t almost_max_dist = almost_dist2weight.size();
 
     Array3d<IT> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
 
@@ -269,7 +269,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
 
                     for (int x = 0; x < search_window_size_; x++)
                     {
-                        int almostAvgDist = (int)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift);
+                        size_t almostAvgDist = (size_t)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift);
 
                         IT weight =
                             almostAvgDist < almost_max_dist ? almost_dist2weight[almostAvgDist] : 0;

From c339720af9cf93ee0c130c55b2a7d2621bca72dc Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 13 Feb 2015 13:38:37 +0100
Subject: [PATCH 07/40] Preparation for 16-bit colored denoising. Currently not
 working due to cvtColor not supporting 16-bit Lab conversion.

---
 modules/photo/src/denoising.cpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 0abeefe5b..8f9d1f84a 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -80,7 +80,7 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
             break;
         case CV_16U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<unsigned short, int64, uint64>(
+                FastNlMeansDenoisingInvoker<ushort, int64, uint64>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_16UC2:
@@ -95,7 +95,7 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
             break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8UC1, CV_8UC2 and CV_8UC3 are supported");
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_16U, CV_16UC2, and CV_16UC3 are supported");
     }
 }
 
@@ -105,9 +105,9 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     Size src_size = _src.size();
-    if (type != CV_8UC3 && type != CV_8UC4)
+    if (type != CV_8UC3 && type != CV_16UC3 && type != CV_8UC4 && type != CV_16UC4)
     {
-        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3!");
+        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3, CV_16UC3, CV_8UC4, or CV_16UC4");
         return;
     }
 
@@ -123,8 +123,8 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
     Mat src_lab;
     cvtColor(src, src_lab, COLOR_LBGR2Lab);
 
-    Mat l(src_size, CV_8U);
-    Mat ab(src_size, CV_8UC2);
+    Mat l(src_size, CV_MAKE_TYPE(depth, 1));
+    Mat ab(src_size, CV_MAKE_TYPE(depth, 2));
     Mat l_ab[] = { l, ab };
     int from_to[] = { 0,0, 1,1, 2,2 };
     mixChannels(&src_lab, 1, l_ab, 2, from_to, 3);
@@ -190,7 +190,7 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
     {
         case CV_8U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned int>(
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
@@ -226,7 +226,7 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
             break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported matrix format! Only uchar, Vec2b, Vec3b are supported");
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_16U, CV_16UC2, and CV_16UC3 are supported");
     }
 }
 
@@ -245,11 +245,12 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
     _dst.create(srcImgs[0].size(), srcImgs[0].type());
     Mat dst = _dst.getMat();
 
+    int type = srcImgs[0].type(), depth = CV_MAT_DEPTH(type);
     int src_imgs_size = static_cast<int>(srcImgs.size());
 
-    if (srcImgs[0].type() != CV_8UC3)
+    if (type != CV_8UC3 && type != CV_16UC3)
     {
-        CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3!");
+        CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3 or CV_16UC3!");
         return;
     }
 
@@ -261,9 +262,9 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
     std::vector<Mat> ab(src_imgs_size);
     for (int i = 0; i < src_imgs_size; i++)
     {
-        src_lab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC3);
-        l[i] = Mat::zeros(srcImgs[0].size(), CV_8UC1);
-        ab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC2);
+        src_lab[i] = Mat::zeros(srcImgs[0].size(), type);
+        l[i] = Mat::zeros(srcImgs[0].size(), CV_MAKE_TYPE(depth, 1));
+        ab[i] = Mat::zeros(srcImgs[0].size(), CV_MAKE_TYPE(depth, 2));
         cvtColor(srcImgs[i], src_lab[i], COLOR_LBGR2Lab);
 
         Mat l_ab[] = { l[i], ab[i] };

From baf266c29eea897d13ae9ce0f85539a74f264b5f Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 17 Feb 2015 21:30:52 +0100
Subject: [PATCH 08/40] Changed from sum of squared differences to sum of abs
 differences

---
 .../src/fast_nlmeans_denoising_invoker.hpp    |  5 ++-
 ...fast_nlmeans_denoising_invoker_commons.hpp | 32 +++----------------
 .../fast_nlmeans_multi_denoising_invoker.hpp  |  5 ++-
 3 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 2de50a77b..cbf9d259f 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -130,13 +130,12 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
 
     const double WEIGHT_THRESHOLD = 0.001;
     const size_t ALLOC_CHUNK = 65536;
-    IT max_dist =
-        (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
+    IT max_dist = (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
     size_t almost_max_dist = 0;
     while (true)
     {
         double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_ || dist > max_dist) break;
 
         if (almost_max_dist >= almost_dist2weight_.size())
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index 0a8713b91..4ca63d652 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -85,7 +85,7 @@ template <typename T, typename IT> struct calcDist_
 {
     static inline IT f(const T a, const T b)
     {
-        return (IT)(a-b) * (IT)(a-b);
+        return std::abs((IT)(a-b));
     }
 };
 
@@ -93,7 +93,7 @@ template <typename ET, typename IT> struct calcDist_<Vec<ET, 2>, IT>
 {
     static inline IT f(const Vec<ET, 2> a, const Vec<ET, 2> b)
     {
-        return (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) + (IT)(a[1]-b[1])*(IT)(a[1]-b[1]);
+        return std::abs((IT)(a[0]-b[0])) + std::abs((IT)(a[1]-b[1]));
     }
 };
 
@@ -101,10 +101,7 @@ template <typename ET, typename IT> struct calcDist_<Vec<ET, 3>, IT>
 {
     static inline IT f(const Vec<ET, 3> a, const Vec<ET, 3> b)
     {
-        return
-            (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
-            (IT)(a[1]-b[1])*(IT)(a[1]-b[1]) +
-            (IT)(a[2]-b[2])*(IT)(a[2]-b[2]);
+        return std::abs((IT)(a[0]-b[0])) + std::abs((IT)(a[1]-b[1])) + std::abs((IT)(a[2]-b[2]));
     }
 };
 
@@ -121,31 +118,10 @@ static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
     return calcDist<T, IT>(a,b);
 }
 
-template <typename T, typename IT> struct calcUpDownDist_
-{
-    static inline IT f(T a_up, T a_down, T b_up, T b_down)
-    {
-        IT A = a_down - b_down;
-        IT B = a_up - b_up;
-        return (A-B)*(A+B);
-    }
-};
-
-template <typename ET, int n, typename IT> struct calcUpDownDist_<Vec<ET, n>, IT>
-{
-private:
-    typedef Vec<ET, n> T;
-public:
-    static inline IT f(T a_up, T a_down, T b_up, T b_down)
-    {
-        return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
-    }
-};
-
 template <typename T, typename IT>
 static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
 {
-    return calcUpDownDist_<T, IT>::f(a_up, a_down, b_up, b_down);
+    return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
 };
 
 template <typename T, typename IT> struct incWithWeight_
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index e0f06c68f..f12a0ef50 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -141,13 +141,12 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
 
     const double WEIGHT_THRESHOLD = 0.001;
     const size_t ALLOC_CHUNK = 65536;
-    IT max_dist =
-        (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
+    IT max_dist = (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
     size_t almost_max_dist = 0;
     while (true)
     {
         double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_ || dist > max_dist) break;
 
         if (almost_max_dist >= almost_dist2weight.size())

From e647b7c7e8a15765f7a18ed496fd2313338b900f Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 17 Feb 2015 23:08:36 +0100
Subject: [PATCH 09/40] Calculating almost_dist2weight at full size to avoid
 bounds checking

---
 .../src/fast_nlmeans_denoising_invoker.hpp    | 31 ++++++++-----------
 .../fast_nlmeans_multi_denoising_invoker.hpp  | 31 ++++++++-----------
 2 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index cbf9d259f..a641c990e 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -128,22 +128,20 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
-    const double WEIGHT_THRESHOLD = 0.001;
-    const size_t ALLOC_CHUNK = 65536;
     IT max_dist = (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
-    size_t almost_max_dist = 0;
-    while (true)
-    {
-        double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_ || dist > max_dist) break;
-
-        if (almost_max_dist >= almost_dist2weight_.size())
-            almost_dist2weight_.resize(almost_max_dist + ALLOC_CHUNK);
-
-        almost_dist2weight_[almost_max_dist++] = weight;
-    }
+    size_t almost_max_dist = (size_t)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight_.resize(almost_max_dist);
+
+    const double WEIGHT_THRESHOLD = 0.001;
+    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
+    {
+        double dist = almost_dist * almost_dist2actual_dist_multiplier;
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
+        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
+            weight = 0;
+
+        almost_dist2weight_[almost_dist] = weight;
+    }
     CV_Assert(almost_dist2weight_[0] == fixed_point_mult_);
 
     // additional optimization init end
@@ -157,8 +155,6 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
     int row_from = range.start;
     int row_to = range.end - 1;
 
-    size_t almost_max_dist = almost_dist2weight_.size();
-
     // sums of cols anf rows for current pixel p
     Array2d<IT> dist_sums(search_window_size_, search_window_size_);
 
@@ -242,8 +238,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
                 for (int x = 0; x < search_window_size_; x++)
                 {
                     size_t almostAvgDist = (size_t)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_);
-                    IT weight =
-                        almostAvgDist < almost_max_dist ? almost_dist2weight_[almostAvgDist] : 0;
+                    IT weight = almost_dist2weight_[almostAvgDist];
                     weights_sum += weight;
 
                     T p = cur_row_ptr[border_size_ + search_window_x + x];
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index f12a0ef50..808b01f50 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -139,22 +139,20 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
 
-    const double WEIGHT_THRESHOLD = 0.001;
-    const size_t ALLOC_CHUNK = 65536;
     IT max_dist = (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
-    size_t almost_max_dist = 0;
-    while (true)
-    {
-        double dist = almost_max_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_ || dist > max_dist) break;
-
-        if (almost_max_dist >= almost_dist2weight.size())
-            almost_dist2weight.resize(almost_max_dist + ALLOC_CHUNK);
-
-        almost_dist2weight[almost_max_dist++] = weight;
-    }
+    int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight.resize(almost_max_dist);
+
+    const double WEIGHT_THRESHOLD = 0.001;
+    for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
+    {
+        double dist = almost_dist * almost_dist2actual_dist_multiplier;
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
+        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
+            weight = 0;
+
+        almost_dist2weight[almost_dist] = weight;
+    }
     CV_Assert(almost_dist2weight[0] == fixed_point_mult_);
 
     // additional optimization init end
@@ -168,8 +166,6 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
     int row_from = range.start;
     int row_to = range.end - 1;
 
-    size_t almost_max_dist = almost_dist2weight.size();
-
     Array3d<IT> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
 
     // for lazy calc optimization
@@ -270,8 +266,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
                     {
                         size_t almostAvgDist = (size_t)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift);
 
-                        IT weight =
-                            almostAvgDist < almost_max_dist ? almost_dist2weight[almostAvgDist] : 0;
+                        IT weight =  almost_dist2weight[almostAvgDist];
                         weights_sum += weight;
 
                         T p = cur_row_ptr[border_size_ + search_window_x + x];

From 8e7aff44869439f04ef9c0f3ae43b7c6f143c715 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Wed, 18 Feb 2015 14:59:52 +0100
Subject: [PATCH 10/40] Changed fastNlMeansDenoising and
 fastNlMeansDenoisingMulti back to sum of sq distances. Moved sq sum of abs
 distances to fastNlMeansDenoisingAbs and fastNlMeansDenoisingMultiAbs

---
 modules/photo/include/opencv2/photo.hpp       |  56 ++++++
 modules/photo/src/denoising.cpp               | 102 +++++++++--
 .../src/fast_nlmeans_denoising_invoker.hpp    |  28 +--
 ...fast_nlmeans_denoising_invoker_commons.hpp | 159 +++++++++++++++---
 .../fast_nlmeans_multi_denoising_invoker.hpp  |  28 +--
 5 files changed, 301 insertions(+), 72 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index 2d1087e89..c25a35e6d 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -138,6 +138,31 @@ parameter.
 CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h = 3,
         int templateWindowSize = 7, int searchWindowSize = 21);
 
+/** @brief Perform image denoising using Non-local Means Denoising
+algorithm <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/>
+with several computational optimizations. Noise is expected to be
+Gaussian white noise. Uses squared sum of absolute value distances
+instead of sum of squared distances for weight calculation
+
+@param src Input 8-bit or 16-bit 1-channel, 2-channel or 3-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
+removes image details, smaller h value preserves details but also preserves some noise
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. Advanced usage of this function can be manual denoising of a colored
+image in different colorspaces. Such an approach is used in fastNlMeansDenoisingColored by converting
+the image to CIELAB colorspace and then separately denoising the L and AB components with different h
+parameters.
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst, float h = 3,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
 /** @brief Modification of fastNlMeansDenoising function for colored images
 
 @param src Input 8-bit 3-channel image.
@@ -186,6 +211,37 @@ CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputA
         int imgToDenoiseIndex, int temporalWindowSize,
         float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
 
+/** @brief Modification of fastNlMeansDenoising function for images
+sequence where consecutive images have been captured in a small period
+of time. For example video. This version of the function is for
+grayscale images or for manual manipulation with colorspaces. For more
+details see
+<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>. Uses
+squared sum of absolute value distances instead of sum of squared
+distances for weight calculation
+
+@param srcImgs Input 8-bit or 16-bit 1-channel, 2-channel or 3-channel
+images sequence. All images should
+have the same type and size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: greater searchWindowSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
+removes noise but also removes image details, smaller h value preserves details but also preserves
+some noise
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMultiAbs( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
 
 @param srcImgs Input 8-bit 3-channel images sequence. All images should have the same type and
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 8f9d1f84a..52065b5f6 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -65,32 +65,62 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar, int, unsigned>(
+                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned>(
+                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                "Unsupported image format! Only CV_8U, CV_8UC2, and CV_8UC3 are supported");
+    }
+}
+
+void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
+                                  int  templateWindowSize, int searchWindowSize)
+{
+    Size src_size = _src.size();
+    Mat src = _src.getMat();
+    _dst.create(src_size, src.type());
+    Mat dst = _dst.getMat();
+
+    switch (src.type()) {
+        case CV_8U:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC2:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC3:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_16U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<ushort, int64, uint64>(
+                FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_16UC2:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_16UC3:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         default:
@@ -105,9 +135,9 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     Size src_size = _src.size();
-    if (type != CV_8UC3 && type != CV_16UC3 && type != CV_8UC4 && type != CV_16UC4)
+    if (type != CV_8UC3 && type != CV_8UC4)
     {
-        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3, CV_16UC3, CV_8UC4, or CV_16UC4");
+        CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3 or CV_8UC4!");
         return;
     }
 
@@ -190,37 +220,77 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
     {
         case CV_8U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned>(
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                "Unsupported image format! Only CV_8U, CV_8UC2, and CV_8UC3 are supported");
+    }
+}
+
+void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray _dst,
+                                       int imgToDenoiseIndex, int temporalWindowSize,
+                                       float h, int templateWindowSize, int searchWindowSize)
+{
+    std::vector<Mat> srcImgs;
+    _srcImgs.getMatVector(srcImgs);
+
+    fastNlMeansDenoisingMultiCheckPreconditions(
+        srcImgs, imgToDenoiseIndex,
+        temporalWindowSize, templateWindowSize, searchWindowSize);
+
+    _dst.create(srcImgs[0].size(), srcImgs[0].type());
+    Mat dst = _dst.getMat();
+
+    switch (srcImgs[0].type())
+    {
+        case CV_8U:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC2:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC3:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_16U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64>(
+                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_16UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_16UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
@@ -248,9 +318,9 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr
     int type = srcImgs[0].type(), depth = CV_MAT_DEPTH(type);
     int src_imgs_size = static_cast<int>(srcImgs.size());
 
-    if (type != CV_8UC3 && type != CV_16UC3)
+    if (type != CV_8UC3)
     {
-        CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3 or CV_16UC3!");
+        CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3!");
         return;
     }
 
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index a641c990e..468fa82f7 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -50,7 +50,7 @@
 
 using namespace cv;
 
-template <typename T, typename IT, typename UIT>
+template <typename T, typename IT, typename UIT, typename D>
 struct FastNlMeansDenoisingInvoker :
         public ParallelLoopBody
 {
@@ -99,8 +99,8 @@ inline int getNearestPowerOf2(int value)
     return p;
 }
 
-template <class T, typename IT, typename UIT>
-FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D>
+FastNlMeansDenoisingInvoker<T, IT, UIT, D>::FastNlMeansDenoisingInvoker(
     const Mat& src, Mat& dst,
     int template_window_size,
     int search_window_size,
@@ -128,7 +128,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
-    IT max_dist = (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
+    IT max_dist = D::template maxDist<T, IT>();
     size_t almost_max_dist = (size_t)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight_.resize(almost_max_dist);
 
@@ -136,7 +136,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
+        IT weight = (IT)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
 
@@ -149,8 +149,8 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
         dst_ = Mat::zeros(src_.size(), src_.type());
 }
 
-template <class T, typename IT, typename UIT>
-void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D>
+void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
@@ -215,7 +215,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
                             dist_sums_row[x] -= col_dist_sums_row[x];
 
                             int bx = start_bx + x;
-                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
+                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + D::template calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
 
                             dist_sums_row[x] += col_dist_sums_row[x];
                             up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -254,8 +254,8 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
     }
 }
 
-template <class T, typename IT, typename UIT>
-inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElementInRow(
     int i,
     Array2d<IT>& dist_sums,
     Array3d<IT>& col_dist_sums,
@@ -276,7 +276,7 @@ inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElement
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                 {
-                    int dist = calcDist<T, IT>(extended_src_,
+                    int dist = D::template calcDist<T, IT>(extended_src_,
                         border_size_ + i + ty, border_size_ + j + tx,
                         border_size_ + start_y + ty, border_size_ + start_x + tx);
 
@@ -288,8 +288,8 @@ inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElement
         }
 }
 
-template <class T, typename IT, typename UIT>
-inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num,
     Array2d<IT>& dist_sums,
     Array3d<IT>& col_dist_sums,
@@ -312,7 +312,7 @@ inline void FastNlMeansDenoisingInvoker<T, IT, UIT>::calcDistSumsForElementInFir
             int by = start_by + y;
             int bx = start_bx + x;
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
-                col_dist_sums[new_last_col_num][y][x] += calcDist<T,IT>(extended_src_, ay + ty, ax, by + ty, bx);
+                col_dist_sums[new_last_col_num][y][x] += D::template calcDist<T,IT>(extended_src_, ay + ty, ax, by + ty, bx);
 
             dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x];
             up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x];
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index 4ca63d652..d55d93ce7 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -81,47 +81,150 @@ template <typename T> struct pixelInfo: public pixelInfo_<T>
     }
 };
 
-template <typename T, typename IT> struct calcDist_
+class DistAbs
 {
-    static inline IT f(const T a, const T b)
+    template <typename T, typename IT> struct calcDist_
     {
-        return std::abs((IT)(a-b));
+        static inline IT f(const T a, const T b)
+        {
+            return std::abs((IT)(a-b));
+        }
+    };
+
+    template <typename ET, typename IT> struct calcDist_<Vec<ET, 2>, IT>
+    {
+        static inline IT f(const Vec<ET, 2> a, const Vec<ET, 2> b)
+        {
+            return std::abs((IT)(a[0]-b[0])) + std::abs((IT)(a[1]-b[1]));
+        }
+    };
+
+    template <typename ET, typename IT> struct calcDist_<Vec<ET, 3>, IT>
+    {
+        static inline IT f(const Vec<ET, 3> a, const Vec<ET, 3> b)
+        {
+            return
+                std::abs((IT)(a[0]-b[0])) +
+                std::abs((IT)(a[1]-b[1])) +
+                std::abs((IT)(a[2]-b[2]));
+        }
+    };
+
+public:
+    template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
+    {
+        return calcDist_<T, IT>::f(a, b);
+    }
+
+    template <typename T, typename IT>
+    static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+    {
+        const T a = m.at<T>(i1, j1);
+        const T b = m.at<T>(i2, j2);
+        return calcDist<T, IT>(a,b);
+    }
+
+    template <typename T, typename IT>
+    static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+    {
+        return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+    };
+
+    template <typename T>
+    static double calcWeight(double dist, double h)
+    {
+        return std::exp(-dist*dist / (h * h * pixelInfo<T>::channels));
+    }
+
+    template <typename T, typename IT>
+    static double maxDist()
+    {
+        return (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
     }
 };
 
-template <typename ET, typename IT> struct calcDist_<Vec<ET, 2>, IT>
+class DistSquared
 {
-    static inline IT f(const Vec<ET, 2> a, const Vec<ET, 2> b)
+    template <typename T, typename IT> struct calcDist_
     {
-        return std::abs((IT)(a[0]-b[0])) + std::abs((IT)(a[1]-b[1]));
-    }
-};
+        static inline IT f(const T a, const T b)
+        {
+            return (IT)(a-b) * (IT)(a-b);
+        }
+    };
 
-template <typename ET, typename IT> struct calcDist_<Vec<ET, 3>, IT>
-{
-    static inline IT f(const Vec<ET, 3> a, const Vec<ET, 3> b)
+    template <typename ET, typename IT> struct calcDist_<Vec<ET, 2>, IT>
     {
-        return std::abs((IT)(a[0]-b[0])) + std::abs((IT)(a[1]-b[1])) + std::abs((IT)(a[2]-b[2]));
+        static inline IT f(const Vec<ET, 2> a, const Vec<ET, 2> b)
+        {
+            return (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) + (IT)(a[1]-b[1])*(IT)(a[1]-b[1]);
+        }
+    };
+
+    template <typename ET, typename IT> struct calcDist_<Vec<ET, 3>, IT>
+    {
+        static inline IT f(const Vec<ET, 3> a, const Vec<ET, 3> b)
+        {
+            return
+                (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
+                (IT)(a[1]-b[1])*(IT)(a[1]-b[1]) +
+                (IT)(a[2]-b[2])*(IT)(a[2]-b[2]);
+        }
+    };
+
+    template <typename T, typename IT> struct calcUpDownDist_
+    {
+        static inline IT f(T a_up, T a_down, T b_up, T b_down)
+        {
+            IT A = a_down - b_down;
+            IT B = a_up - b_up;
+            return (A-B)*(A+B);
+        }
+    };
+
+    template <typename ET, int n, typename IT> struct calcUpDownDist_<Vec<ET, n>, IT>
+    {
+    private:
+        typedef Vec<ET, n> T;
+    public:
+        static inline IT f(T a_up, T a_down, T b_up, T b_down)
+        {
+            return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+        }
+    };
+
+public:
+    template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
+    {
+        return calcDist_<T, IT>::f(a, b);
     }
-};
 
-template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
-{
-    return calcDist_<T, IT>::f(a, b);
-}
+    template <typename T, typename IT>
+    static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+    {
+        const T a = m.at<T>(i1, j1);
+        const T b = m.at<T>(i2, j2);
+        return calcDist<T, IT>(a,b);
+    }
 
-template <typename T, typename IT>
-static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
-{
-    const T a = m.at<T>(i1, j1);
-    const T b = m.at<T>(i2, j2);
-    return calcDist<T, IT>(a,b);
-}
+    template <typename T, typename IT>
+    static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+    {
+        return calcUpDownDist_<T, IT>::f(a_up, a_down, b_up, b_down);
+    };
 
-template <typename T, typename IT>
-static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
-{
-    return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+    template <typename T>
+    static double calcWeight(double dist, double h)
+    {
+        return std::exp(-dist / (h * h * pixelInfo<T>::channels));
+    }
+
+    template <typename T, typename IT>
+    static double maxDist()
+    {
+        return (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() *
+            (IT)pixelInfo<T>::channels;
+    }
 };
 
 template <typename T, typename IT> struct incWithWeight_
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 808b01f50..0a2bdd739 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -50,7 +50,7 @@
 
 using namespace cv;
 
-template <typename T, typename IT, typename UIT>
+template <typename T, typename IT, typename UIT, typename D>
 struct FastNlMeansMultiDenoisingInvoker :
         ParallelLoopBody
 {
@@ -94,8 +94,8 @@ private:
                                           Array4d<IT>& up_col_dist_sums) const;
 };
 
-template <class T, typename IT, typename UIT>
-FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D>
+FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::FastNlMeansMultiDenoisingInvoker(
     const std::vector<Mat>& srcImgs,
     int imgToDenoiseIndex,
     int temporalWindowSize,
@@ -139,7 +139,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
 
-    IT max_dist = (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
+    IT max_dist = D::template maxDist<T,IT>();
     int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight.resize(almost_max_dist);
 
@@ -147,7 +147,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist*dist / (h * h * pixelInfo<T>::channels)));
+        IT weight = (IT)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
 
@@ -160,8 +160,8 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
         dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type());
 }
 
-template <class T, typename IT, typename UIT>
-void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D>
+void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
@@ -234,7 +234,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
                                 dist_sums_row[x] -= col_dist_sums_row[x];
 
                                 col_dist_sums_row[x] = up_col_dist_sums_row[x] +
-                                    calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
+                                    D::template calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
 
                                 dist_sums_row[x] += col_dist_sums_row[x];
                                 up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -284,8 +284,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
     }
 }
 
-template <class T, typename IT, typename UIT>
-inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElementInRow(
         int i, Array3d<IT>& dist_sums, Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
 {
     int j = 0;
@@ -310,7 +310,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstEl
                 {
                     for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                     {
-                        IT dist = calcDist<T, IT>(
+                        IT dist = D::template calcDist<T, IT>(
                                     main_extended_src_.at<T>(border_size_ + i + ty, border_size_ + j + tx),
                                     cur_extended_src.at<T>(border_size_ + start_y + ty, border_size_ + start_x + tx));
 
@@ -325,8 +325,8 @@ inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForFirstEl
     }
 }
 
-template <class T, typename IT, typename UIT>
-inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num, Array3d<IT>& dist_sums,
     Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
 {
@@ -353,7 +353,7 @@ inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::calcDistSumsForElement
                 IT* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
                 for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 {
-                    *col_dist_sums_ptr += calcDist<T, IT>(
+                    *col_dist_sums_ptr += D::template calcDist<T, IT>(
                                 main_extended_src_.at<T>(ay + ty, ax),
                                 cur_extended_src.at<T>(by + ty, bx));
                 }

From 0fdb95e195c499a6aa9a02d5a49abed9709b4258 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Sun, 1 Mar 2015 20:59:34 +0100
Subject: [PATCH 11/40] Refactoring and addition of CV_8UC3 to
 ocl_fastNlMeansDenoising

---
 .../src/fast_nlmeans_denoising_opencl.hpp     | 44 +++++++++---
 modules/photo/src/opencl/nlmeans.cl           | 72 ++++++++++++-------
 2 files changed, 78 insertions(+), 38 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index 1cdd8fa49..cd7dde385 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -70,11 +70,11 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
 static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
                                      int templateWindowSize, int searchWindowSize)
 {
-    int type = _src.type(), cn = CV_MAT_CN(type);
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
     Size size = _src.size();
 
-    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC4 )
+    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC3 )
         return false;
 
     int templateWindowHalfWize = templateWindowSize / 2;
@@ -86,13 +86,15 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
 
     char cvt[2][40];
     String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
-                         " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
+                         " -D sample_t=%s -D pixel_t=%s -D int_t=%s"
+                         " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
-                         " -D convert_int_t=%s -D cn=%d -D convert_uchar_t=%s",
-                         templateWindowSize, searchWindowSize, ocl::typeToStr(type),
-                         ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, ctaSize,
-                         templateWindowHalfWize, searchWindowHalfSize,
-                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn,
+                         " -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s",
+                         templateWindowSize, searchWindowSize,
+                         ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
+                         BLOCK_COLS, BLOCK_ROWS,
+                         ctaSize, templateWindowHalfWize, searchWindowHalfSize,
+                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), type == CV_8UC3 ? 4 : cn,
                          ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
 
     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
@@ -107,10 +109,22 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
 
     UMat srcex;
     int borderSize = searchWindowHalfSize + templateWindowHalfWize;
-    copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+    if (type == CV_8UC3) {
+        Mat src_rgb = _src.getMat(), src_rgba(size, CV_8UC4);
+        int from_to[] = { 0,0, 1,1, 2,2 };
+        mixChannels(&src_rgb, 1, &src_rgba, 1, from_to, 3);
+        copyMakeBorder(src_rgba, srcex,
+                       borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+    }
+    else
+        copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
 
     _dst.create(size, type);
-    UMat dst = _dst.getUMat();
+    UMat dst;
+    if (type == CV_8UC3)
+        dst.create(size, CV_8UC4);
+    else
+        dst = _dst.getUMat();
 
     int searchWindowSizeSq = searchWindowSize * searchWindowSize;
     Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
@@ -123,7 +137,15 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
            ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
 
     size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
-    return k.run(2, globalsize, localsize, false);
+    if (!k.run(2, globalsize, localsize, false)) return false;
+
+    if (type == CV_8UC3) {
+        Mat dst_rgba = dst.getMat(ACCESS_READ), dst_rgb = _dst.getMat();
+        int from_to[] = { 0,0, 1,1, 2,2 };
+        mixChannels(&dst_rgba, 1, &dst_rgb, 1, from_to, 3);
+    }
+
+    return true;
 }
 
 static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl
index af3fb1f9b..c48adda0b 100644
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -29,8 +29,11 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
     if (almostDist < almostMaxDist)
     {
         FT dist = almostDist * almostDist2ActualDistMultiplier;
+#ifdef ABS
+        int weight = convert_int_sat_rte(fixedPointMult * exp(-dist*dist * den));
+#else
         int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
-
+#endif
         if (weight < WEIGHT_THRESHOLD * fixedPointMult)
             weight = 0;
 
@@ -44,21 +47,33 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
 
 #define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE)
 
-inline int calcDist(uchar_t a, uchar_t b)
+inline int calcDist(pixel_t a, pixel_t b)
 {
+#ifdef ABS
+    int_t retval = convert_int_t(abs_diff(a, b));
+#else
     int_t diff = convert_int_t(a) - convert_int_t(b);
     int_t retval = diff * diff;
+#endif
 
 #if cn == 1
     return retval;
 #elif cn == 2
     return retval.x + retval.y;
+#elif cn == 3 || cn == 4       /* A is ignored */
+    return retval.x + retval.y + retval.z;
 #else
-#error "cn should be either 1 or 2"
+#error "cn should be either 1, 2, 3 or 4"
 #endif
 }
 
-inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t)
+#ifdef ABS
+inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
+{
+    return calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t);
+}
+#else
+inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_value, pixel_t up_value_t)
 {
     int_t A = convert_int_t(down_value) - convert_int_t(down_value_t);
     int_t B = convert_int_t(up_value) - convert_int_t(up_value_t);
@@ -68,14 +83,17 @@ inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_v
     return retval;
 #elif cn == 2
     return retval.x + retval.y;
+#elif cn == 3 || cn == 4        /* A is ignored */
+    return retval.x + retval.y + retval.z;
 #else
-#error "cn should be either 1 or 2"
+#error "cn should be either 1, 2, 3 or 4"
 #endif
 }
+#endif
 
 #define COND if (x == 0 && y == 0)
 
-inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset,
+inline void calcFirstElementInRow(__global const sample_t * src, int src_step, int src_offset,
                                   __local int * dists, int y, int x, int id,
                                   __global int * col_dists, __global int * up_col_dists)
 {
@@ -87,9 +105,9 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
     {
         int dist = 0, value;
 
-        __global const uchar_t * src_template = (__global const uchar_t *)(src +
+        __global const pixel_t * src_template = (__global const pixel_t *)(src +
             mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
-        __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
         __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
 
         #pragma unroll
@@ -107,8 +125,8 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
                 dist += value;
             }
 
-            src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
-            src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
+            src_current = (__global const pixel_t *)((__global const sample_t *)src_current + src_step);
+            src_template = (__global const pixel_t *)((__global const sample_t *)src_template + src_step);
         }
 
         #pragma unroll
@@ -120,7 +138,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int
     }
 }
 
-inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,
+inline void calcElementInFirstRow(__global const sample_t * src, int src_step, int src_offset,
                                   __local int * dists, int y, int x0, int x, int id, int first,
                                   __global int * col_dists, __global int * up_col_dists)
 {
@@ -130,8 +148,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
 
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
-        __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
-        __global const uchar_t * src_template = (__global const uchar_t *)(src +
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
+        __global const pixel_t * src_template = (__global const pixel_t *)(src +
             mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
         __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
 
@@ -142,8 +160,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
         {
             col_dist += calcDist(src_current[0], src_template[0]);
 
-            src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step);
-            src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step);
+            src_current = (__global const pixel_t *)((__global const sample_t *)src_current + src_step);
+            src_template = (__global const pixel_t *)((__global const sample_t *)src_template + src_step);
         }
 
         dists[i] += col_dist - col_dists_current[first];
@@ -152,7 +170,7 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int
     }
 }
 
-inline void calcElement(__global const uchar * src, int src_step, int src_offset,
+inline void calcElement(__global const sample_t * src, int src_step, int src_offset,
                         __local int * dists, int y, int x0, int x, int id, int first,
                         __global int * col_dists, __global int * up_col_dists)
 {
@@ -160,8 +178,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
     int sy_up = y - TEMPLATE_SIZE2 - 1;
     int sy_down = y + TEMPLATE_SIZE2;
 
-    uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
-    uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
+    pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
+    pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
 
     sx -= SEARCH_SIZE2;
     sy_up -= SEARCH_SIZE2;
@@ -171,8 +189,8 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
     {
         int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
 
-        uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
-        uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
+        pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
+        pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
 
         __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
         __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
@@ -185,9 +203,9 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
     }
 }
 
-inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
+inline void convolveWindow(__global const sample_t * src, int src_step, int src_offset,
                            __local int * dists, __global const int * almostDist2Weight,
-                           __global uchar * dst, int dst_step, int dst_offset,
+                           __global sample_t * dst, int dst_step, int dst_offset,
                            int y, int x, int id, __local int * weights_local,
                            __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
 {
@@ -197,7 +215,7 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
         int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
-        int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index));
+        int_t src_value = convert_int_t(*(__global const pixel_t *)(src + src_index));
 
         int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
         int weight = almostDist2Weight[almostAvgDist];
@@ -228,13 +246,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
             weighted_sum_local[2] + weighted_sum_local[3];
         int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
 
-        *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / (int_t)(weights_local_0));
+        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (int_t)(weights_local_0));
     }
 }
 
-__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
-                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                                   __global const int * almostDist2Weight, __global uchar * buffer,
+__kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step, int src_offset,
+                                   __global sample_t * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                   __global const int * almostDist2Weight, __global sample_t * buffer,
                                    int almostTemplateWindowSizeSqBinShift)
 {
     int block_x = get_group_id(0), nblocks_x = get_num_groups(0);

From 9f7cac8c5933df74bf953227368b1bcd181d1b12 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Sun, 1 Mar 2015 21:01:57 +0100
Subject: [PATCH 12/40] Addition of test cases for CV_8UC3

---
 modules/photo/test/ocl/test_denoising.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index cb2d74f85..48efc8ab5 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -87,7 +87,7 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
     }
 }
 
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2, 3), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored, Combine(Values(3, 4), Bool()));
 
 } } // namespace cvtest::ocl

From a9ff335a8923c92e9dc86ddac3571aeaae6f0fbf Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Sun, 1 Mar 2015 22:21:36 +0100
Subject: [PATCH 13/40] Added OpenCL support for FastNlMeansDenoisingAbs

---
 modules/photo/src/denoising.cpp                |  6 +++++-
 .../src/fast_nlmeans_denoising_opencl.hpp      | 18 +++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 52065b5f6..3fe1f2b90 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -51,7 +51,7 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     Size src_size = _src.size();
     CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize))
+               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, false))
 
     Mat src = _src.getMat();
     _dst.create(src_size, src.type());
@@ -88,6 +88,10 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
                                   int  templateWindowSize, int searchWindowSize)
 {
     Size src_size = _src.size();
+    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
+               src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
+               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, true))
+
     Mat src = _src.getMat();
     _dst.create(src_size, src.type());
     Mat dst = _dst.getMat();
diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index cd7dde385..5e96533fb 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -30,7 +30,7 @@ static int divUp(int a, int b)
 
 template <typename FT>
 static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
-                                      int & almostTemplateWindowSizeSqBinShift)
+                                      int & almostTemplateWindowSizeSqBinShift, bool abs)
 {
     const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255;
     int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue;
@@ -48,15 +48,15 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
     FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
 
     const FT WEIGHT_THRESHOLD = 1e-3f;
-    int maxDist = 255 * 255 * cn;
+    int maxDist = abs ? 255 * cn : 255 * 255 * cn;
     int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
     FT den = 1.0f / (h * h * cn);
 
     almostDist2Weight.create(1, almostMaxDist, CV_32SC1);
 
     ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
-                  format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                  format("-D OP_CALC_WEIGHTS -D FT=%s%s%s", ocl::typeToStr(depth),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", abs ? " -D ABS" : ""));
     if (k.empty())
         return false;
 
@@ -68,7 +68,7 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
 }
 
 static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
-                                     int templateWindowSize, int searchWindowSize)
+                                     int templateWindowSize, int searchWindowSize, bool abs)
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
@@ -89,21 +89,21 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
                          " -D sample_t=%s -D pixel_t=%s -D int_t=%s"
                          " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
-                         " -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s",
+                         " -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s%s",
                          templateWindowSize, searchWindowSize,
                          ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
                          BLOCK_COLS, BLOCK_ROWS,
                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
                          ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), type == CV_8UC3 ? 4 : cn,
-                         ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]));
+                         ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]), abs ? " -D ABS" : "");
 
     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
     if (k.empty())
         return false;
 
     UMat almostDist2Weight;
-    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn,
-                                   almostTemplateWindowSizeSqBinShift))
+    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize,
+                                          h, cn, almostTemplateWindowSizeSqBinShift, abs))
         return false;
     CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
 

From 3bde9e93651a0d2f388ba3b5be7e3c5d9de9820c Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Sun, 1 Mar 2015 22:22:09 +0100
Subject: [PATCH 14/40] Added test cases

---
 modules/photo/test/ocl/test_denoising.cpp | 51 +++++++++++++++++------
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index 48efc8ab5..30dc680c8 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -13,11 +13,11 @@
 namespace cvtest {
 namespace ocl {
 
-PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
+PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
 {
     int cn, templateWindowSize, searchWindowSize;
     float h;
-    bool use_roi;
+    bool use_roi, use_image;
 
     TEST_DECLARE_INPUT_PARAMETER(src);
     TEST_DECLARE_OUTPUT_PARAMETER(dst);
@@ -26,6 +26,7 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
     {
         cn = GET_PARAM(0);
         use_roi = GET_PARAM(1);
+        use_image = GET_PARAM(2);
 
         templateWindowSize = 7;
         searchWindowSize = 21;
@@ -34,20 +35,27 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool)
 
     virtual void generateTestData()
     {
+        const int type = CV_8UC(cn);
         Mat image;
-        if (cn == 1)
-        {
-            image = readImage("denoising/lena_noised_gaussian_sigma=10.png", IMREAD_GRAYSCALE);
+
+        if (use_image) {
+            image = readImage("denoising/lena_noised_gaussian_sigma=10.png",
+                                  cn == 1 ? IMREAD_GRAYSCALE : IMREAD_COLOR);
             ASSERT_FALSE(image.empty());
         }
 
-        const int type = CV_8UC(cn);
-
-        Size roiSize = cn == 1 ? image.size() : randomSize(1, MAX_VALUE);
+        Size roiSize = use_image ? image.size() : randomSize(1, MAX_VALUE);
         Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
-        if (cn == 1)
-            image.copyTo(src_roi);
+        if (use_image) {
+            ASSERT_TRUE(cn == 1 || cn == 2 || cn == 3);
+            if (cn == 2) {
+                int from_to[] = { 0,0, 1,1 };
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 2);
+            }
+            else image.copyTo(src_roi);
+        }
 
         Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255);
@@ -72,6 +80,21 @@ OCL_TEST_P(FastNlMeansDenoising, Mat)
     }
 }
 
+typedef FastNlMeansDenoisingTestBase FastNlMeansDenoisingAbs;
+
+OCL_TEST_P(FastNlMeansDenoisingAbs, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
+        OCL_OFF(cv::fastNlMeansDenoisingAbs(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoisingAbs(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1);
+    }
+}
+
 typedef FastNlMeansDenoisingTestBase FastNlMeansDenoisingColored;
 
 OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
@@ -87,8 +110,12 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
     }
 }
 
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2, 3), Bool()));
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored, Combine(Values(3, 4), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
+                            Combine(Values(1, 2, 3), Bool(), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingAbs,
+                            Combine(Values(1, 2, 3), Bool(), Bool()));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
+                            Combine(Values(3, 4), Bool(), Values(false)));
 
 } } // namespace cvtest::ocl
 

From 73663dcdd1f0f06a0567f266c4f9ebeb9b74a2b2 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 2 Mar 2015 03:29:17 +0100
Subject: [PATCH 15/40] Added support for 16-bit input

---
 .../src/fast_nlmeans_denoising_opencl.hpp     | 57 ++++++++++++-------
 modules/photo/src/opencl/nlmeans.cl           | 31 ++++++----
 2 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index 5e96533fb..a88b5cfd7 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -28,12 +28,14 @@ static int divUp(int a, int b)
     return (a + b - 1) / b;
 }
 
-template <typename FT>
+template <typename FT, typename ST, typename WT>
 static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
                                       int & almostTemplateWindowSizeSqBinShift, bool abs)
 {
-    const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255;
-    int fixedPointMult = std::numeric_limits<int>::max() / maxEstimateSumValue;
+    const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
+        std::numeric_limits<ST>::max();
+    int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
+                                           std::numeric_limits<int>::max());
     int depth = DataType<FT>::depth;
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
@@ -48,7 +50,8 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
     FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
 
     const FT WEIGHT_THRESHOLD = 1e-3f;
-    int maxDist = abs ? 255 * cn : 255 * 255 * cn;
+    int maxDist = abs ? std::numeric_limits<ST>::max() * cn :
+        std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
     int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
     FT den = 1.0f / (h * h * cn);
 
@@ -74,7 +77,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
     Size size = _src.size();
 
-    if ( type != CV_8UC1 && type != CV_8UC2 && type != CV_8UC3 )
+    if (cn != 1 && cn != 2 && cn != 3 && depth != CV_8U && (!abs || depth != CV_16U))
         return false;
 
     int templateWindowHalfWize = templateWindowSize / 2;
@@ -84,45 +87,60 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
     int almostTemplateWindowSizeSqBinShift = -1;
 
-    char cvt[2][40];
+    char buf[4][40];
     String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
                          " -D sample_t=%s -D pixel_t=%s -D int_t=%s"
+                         " -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
                          " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
                          " -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s%s",
                          templateWindowSize, searchWindowSize,
                          ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
+                         depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
+                         (sprintf(buf[0], "long%d", cn), buf[0]),
+                         depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
+                         (sprintf(buf[1], "convert_long%d", cn), buf[1]),
                          BLOCK_COLS, BLOCK_ROWS,
                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
-                         ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), type == CV_8UC3 ? 4 : cn,
-                         ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1]), abs ? " -D ABS" : "");
+                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn == 3 ? 4 : cn,
+                         ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : "");
 
     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
     if (k.empty())
         return false;
 
     UMat almostDist2Weight;
-    if (!ocl_calcAlmostDist2Weight<float>(almostDist2Weight, searchWindowSize, templateWindowSize,
-                                          h, cn, almostTemplateWindowSizeSqBinShift, abs))
+    if ((depth == CV_8U &&
+         !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
+                                                       searchWindowSize, templateWindowSize,
+                                                       h, cn,
+                                                       almostTemplateWindowSizeSqBinShift,
+                                                       abs)) ||
+        (depth == CV_16U &&
+         !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
+                                                          searchWindowSize, templateWindowSize,
+                                                          h, cn,
+                                                          almostTemplateWindowSizeSqBinShift,
+                                                          abs)))
         return false;
     CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
 
     UMat srcex;
     int borderSize = searchWindowHalfSize + templateWindowHalfWize;
-    if (type == CV_8UC3) {
-        Mat src_rgb = _src.getMat(), src_rgba(size, CV_8UC4);
+    if (cn == 3) {
+        UMat tmp(size, CV_MAKE_TYPE(depth, 4));
         int from_to[] = { 0,0, 1,1, 2,2 };
-        mixChannels(&src_rgb, 1, &src_rgba, 1, from_to, 3);
-        copyMakeBorder(src_rgba, srcex,
-                       borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+        mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, tmp), from_to, 3);
+        copyMakeBorder(tmp, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
     }
     else
         copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
 
     _dst.create(size, type);
     UMat dst;
-    if (type == CV_8UC3)
-        dst.create(size, CV_8UC4);
+    if (cn == 3)
+        dst.create(size, CV_MAKE_TYPE(depth, 4));
     else
         dst = _dst.getUMat();
 
@@ -139,10 +157,9 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
     if (!k.run(2, globalsize, localsize, false)) return false;
 
-    if (type == CV_8UC3) {
-        Mat dst_rgba = dst.getMat(ACCESS_READ), dst_rgb = _dst.getMat();
+    if (cn == 3) {
         int from_to[] = { 0,0, 1,1, 2,2 };
-        mixChannels(&dst_rgba, 1, &dst_rgb, 1, from_to, 3);
+        mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
     }
 
     return true;
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl
index c48adda0b..3a104c42a 100644
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -206,22 +206,23 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off
 inline void convolveWindow(__global const sample_t * src, int src_step, int src_offset,
                            __local int * dists, __global const int * almostDist2Weight,
                            __global sample_t * dst, int dst_step, int dst_offset,
-                           int y, int x, int id, __local int * weights_local,
-                           __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
+                           int y, int x, int id, __local weight_t * weights_local,
+                           __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
 {
-    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0;
-    int_t weighted_sum = (int_t)(0);
+    int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
+    weight_t weights = 0;
+    sum_t weighted_sum = (sum_t)(0);
 
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
         int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
-        int_t src_value = convert_int_t(*(__global const pixel_t *)(src + src_index));
+        sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
 
         int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
         int weight = almostDist2Weight[almostAvgDist];
 
-        weights += weight;
-        weighted_sum += (int_t)(weight) * src_value;
+        weights += (weight_t)weight;
+        weighted_sum += (sum_t)(weight) * src_value;
     }
 
     weights_local[id] = weights;
@@ -242,11 +243,11 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_
     if (id == 0)
     {
         int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
-        int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
+        sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
             weighted_sum_local[2] + weighted_sum_local[3];
-        int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
+        weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
 
-        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (int_t)(weights_local_0));
+        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)(weights_local_0));
     }
 }
 
@@ -259,8 +260,9 @@ __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step,
     int block_y = get_group_id(1);
     int id = get_local_id(0), first;
 
-    __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE];
-    __local int_t weighted_sum[CTA_SIZE];
+    __local int dists[SEARCH_SIZE_SQ];
+    __local weight_t weights[CTA_SIZE];
+    __local sum_t weighted_sum[CTA_SIZE];
 
     int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols);
     int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows);
@@ -271,6 +273,11 @@ __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step,
     __global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int));
     __global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;
 
+    src_step /= sizeof(sample_t);
+    src_offset /= sizeof(sample_t);
+    dst_step /= sizeof(sample_t);
+    dst_offset /= sizeof(sample_t);
+
     for (int y = y0; y < y1; ++y)
         for (int x = x0; x < x1; ++x)
         {

From 50bb14a0a8642ffdf71969c78226ddd236bf97b9 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 2 Mar 2015 15:48:00 +0100
Subject: [PATCH 16/40] Avoiding unnecessary copy by creating borders in place
 after RGB -> RGBA conversion

---
 modules/photo/src/fast_nlmeans_denoising_opencl.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index a88b5cfd7..9c0e40401 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -129,10 +129,12 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     UMat srcex;
     int borderSize = searchWindowHalfSize + templateWindowHalfWize;
     if (cn == 3) {
-        UMat tmp(size, CV_MAKE_TYPE(depth, 4));
+        srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
+        UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
         int from_to[] = { 0,0, 1,1, 2,2 };
-        mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, tmp), from_to, 3);
-        copyMakeBorder(tmp, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
+        mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
+        copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
+                       BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
     }
     else
         copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);

From 87760d13fbee8b005800c55246fa59a3e4cc8685 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 2 Mar 2015 22:33:14 +0100
Subject: [PATCH 17/40] Cleanup and addition of 4-component support for
 ocl_fastNlMeansDenoising

---
 .../src/fast_nlmeans_denoising_opencl.hpp     |  6 ++--
 modules/photo/src/opencl/nlmeans.cl           | 28 +++++++++++--------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index 9c0e40401..41264045c 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -77,7 +77,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
     Size size = _src.size();
 
-    if (cn != 1 && cn != 2 && cn != 3 && depth != CV_8U && (!abs || depth != CV_16U))
+    if (cn != 1 && cn != 2 && cn != 3 && cn != 4 && depth != CV_8U && (!abs || depth != CV_16U))
         return false;
 
     int templateWindowHalfWize = templateWindowSize / 2;
@@ -93,7 +93,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
                          " -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
                          " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
-                         " -D convert_int_t=%s -D cn=%d -D convert_pixel_t=%s%s",
+                         " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
                          templateWindowSize, searchWindowSize,
                          ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
                          depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
@@ -103,7 +103,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
                          (sprintf(buf[1], "convert_long%d", cn), buf[1]),
                          BLOCK_COLS, BLOCK_ROWS,
                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
-                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn == 3 ? 4 : cn,
+                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn, cn == 3 ? 4 : cn,
                          ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : "");
 
     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl
index 3a104c42a..91b012354 100644
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -60,8 +60,10 @@ inline int calcDist(pixel_t a, pixel_t b)
     return retval;
 #elif cn == 2
     return retval.x + retval.y;
-#elif cn == 3 || cn == 4       /* A is ignored */
+#elif cn == 3
     return retval.x + retval.y + retval.z;
+#elif cn == 4
+    return retval.x + retval.y + retval.z + retval.w;
 #else
 #error "cn should be either 1, 2, 3 or 4"
 #endif
@@ -83,8 +85,10 @@ inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_v
     return retval;
 #elif cn == 2
     return retval.x + retval.y;
-#elif cn == 3 || cn == 4        /* A is ignored */
+#elif cn == 3
     return retval.x + retval.y + retval.z;
+#elif cn == 4
+    return retval.x + retval.y + retval.z + retval.w;
 #else
 #error "cn should be either 1, 2, 3 or 4"
 #endif
@@ -106,8 +110,8 @@ inline void calcFirstElementInRow(__global const sample_t * src, int src_step, i
         int dist = 0, value;
 
         __global const pixel_t * src_template = (__global const pixel_t *)(src +
-            mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
-        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
+            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
         __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE;
 
         #pragma unroll
@@ -148,9 +152,9 @@ inline void calcElementInFirstRow(__global const sample_t * src, int src_step, i
 
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
-        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset)));
+        __global const pixel_t * src_current = (__global const pixel_t *)(src + mad24(y, src_step, mad24(psz, x, src_offset)));
         __global const pixel_t * src_template = (__global const pixel_t *)(src +
-            mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset)));
+            mad24(sy + i / SEARCH_SIZE, src_step, mad24(psz, sx + i % SEARCH_SIZE, src_offset)));
         __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i;
 
         int col_dist = 0;
@@ -178,8 +182,8 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off
     int sy_up = y - TEMPLATE_SIZE2 - 1;
     int sy_down = y + TEMPLATE_SIZE2;
 
-    pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset)));
-    pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset)));
+    pixel_t up_value = *(__global const pixel_t *)(src + mad24(sy_up, src_step, mad24(psz, sx, src_offset)));
+    pixel_t down_value = *(__global const pixel_t *)(src + mad24(sy_down, src_step, mad24(psz, sx, src_offset)));
 
     sx -= SEARCH_SIZE2;
     sy_up -= SEARCH_SIZE2;
@@ -189,8 +193,8 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off
     {
         int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE;
 
-        pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset)));
-        pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset)));
+        pixel_t up_value_t = *(__global const pixel_t *)(src + mad24(sy_up + wy, src_step, mad24(psz, sx + wx, src_offset)));
+        pixel_t down_value_t = *(__global const pixel_t *)(src + mad24(sy_down + wy, src_step, mad24(psz, sx + wx, src_offset)));
 
         __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first);
         __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i);
@@ -215,7 +219,7 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_
 
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
-        int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset));
+        int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, psz, src_offset));
         sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
 
         int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
@@ -242,7 +246,7 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_
 
     if (id == 0)
     {
-        int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset));
+        int dst_index = mad24(y, dst_step, mad24(psz, x, dst_offset));
         sum_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] +
             weighted_sum_local[2] + weighted_sum_local[3];
         weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];

From ae08884854a7b46db96eef489b6a943d1bb04f56 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 3 Mar 2015 01:19:34 +0100
Subject: [PATCH 18/40] Added support for 4-component input for
 fastNlMeansDenoising[Multi][Abs]

---
 modules/photo/src/denoising.cpp               | 41 ++++++++++++++--
 .../src/fast_nlmeans_denoising_invoker.hpp    |  2 +-
 ...fast_nlmeans_denoising_invoker_commons.hpp | 48 +++++++++++++++++++
 .../fast_nlmeans_multi_denoising_invoker.hpp  |  2 +-
 4 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 3fe1f2b90..b41f83ec9 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -78,9 +78,14 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
                     FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, and CV_8UC3 are supported");
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
     }
 }
 
@@ -112,6 +117,11 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
                 FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
         case CV_16U:
             parallel_for_(cv::Range(0, src.rows),
                 FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs>(
@@ -127,9 +137,14 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
                 FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
+        case CV_16UC4:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_16U, CV_16UC2, and CV_16UC3 are supported");
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
     }
 }
 
@@ -240,9 +255,15 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, and CV_8UC3 are supported");
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
     }
 }
 
@@ -280,6 +301,12 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
         case CV_16U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs>(
@@ -298,9 +325,15 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
+        case CV_16UC4:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_16U, CV_16UC2, and CV_16UC3 are supported");
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
     }
 }
 
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 468fa82f7..01588b03d 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -227,7 +227,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
             }
 
             // calc weights
-            IT estimation[3], weights_sum = 0;
+            IT estimation[pixelInfo<T>::channels], weights_sum = 0;
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
 
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index d55d93ce7..d77ca3e1f 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -110,6 +110,18 @@ class DistAbs
         }
     };
 
+    template <typename ET, typename IT> struct calcDist_<Vec<ET, 4>, IT>
+    {
+        static inline IT f(const Vec<ET, 4> a, const Vec<ET, 4> b)
+        {
+            return
+                std::abs((IT)(a[0]-b[0])) +
+                std::abs((IT)(a[1]-b[1])) +
+                std::abs((IT)(a[2]-b[2])) +
+                std::abs((IT)(a[3]-b[3]));
+        }
+    };
+
 public:
     template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
     {
@@ -172,6 +184,18 @@ class DistSquared
         }
     };
 
+    template <typename ET, typename IT> struct calcDist_<Vec<ET, 4>, IT>
+    {
+        static inline IT f(const Vec<ET, 4> a, const Vec<ET, 4> b)
+        {
+            return
+                (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
+                (IT)(a[1]-b[1])*(IT)(a[1]-b[1]) +
+                (IT)(a[2]-b[2])*(IT)(a[2]-b[2]) +
+                (IT)(a[3]-b[3])*(IT)(a[3]-b[3]);
+        }
+    };
+
     template <typename T, typename IT> struct calcUpDownDist_
     {
         static inline IT f(T a_up, T a_down, T b_up, T b_down)
@@ -254,6 +278,17 @@ template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT>
     }
 };
 
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 4>, IT>
+{
+    static inline void f(IT* estimation, IT weight, Vec<ET, 4> p)
+    {
+        estimation[0] += weight * p[0];
+        estimation[1] += weight * p[1];
+        estimation[2] += weight * p[2];
+        estimation[3] += weight * p[3];
+    }
+};
+
 template <typename T, typename IT>
 static inline void incWithWeight(IT* estimation, IT weight, T p)
 {
@@ -291,6 +326,19 @@ template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 3>, IT
     }
 };
 
+template <typename ET, typename IT> struct saturateCastFromArray_<Vec<ET, 4>, IT>
+{
+    static inline Vec<ET, 4> f(IT* estimation)
+    {
+        Vec<ET, 4> res;
+        res[0] = saturate_cast<ET>(estimation[0]);
+        res[1] = saturate_cast<ET>(estimation[1]);
+        res[2] = saturate_cast<ET>(estimation[2]);
+        res[3] = saturate_cast<ET>(estimation[3]);
+        return res;
+    }
+};
+
 template <typename T, typename IT> static inline T saturateCastFromArray(IT* estimation)
 {
     return saturateCastFromArray_<T, IT>::f(estimation);
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 0a2bdd739..eb2078643 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -249,7 +249,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
             // calc weights
             IT weights_sum = 0;
 
-            IT estimation[3];
+            IT estimation[pixelInfo<T>::channels];
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
 

From ac6771f975144c00c153431687dce6ecc45303cf Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 3 Mar 2015 01:20:33 +0100
Subject: [PATCH 19/40] Added test cases

---
 modules/photo/test/ocl/test_denoising.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index 30dc680c8..4aba4b51e 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -48,12 +48,17 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
         Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
         if (use_image) {
-            ASSERT_TRUE(cn == 1 || cn == 2 || cn == 3);
+            ASSERT_TRUE(cn == 1 || cn == 2 || cn == 3 || cn == 4);
             if (cn == 2) {
                 int from_to[] = { 0,0, 1,1 };
                 src_roi.create(roiSize, type);
                 mixChannels(&image, 1, &src_roi, 1, from_to, 2);
             }
+            else if (cn == 4) {
+                int from_to[] = { 0,0, 1,1, 2,2, 1,3};
+                src_roi.create(roiSize, type);
+                mixChannels(&image, 1, &src_roi, 1, from_to, 4);
+            }
             else image.copyTo(src_roi);
         }
 
@@ -111,9 +116,9 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
 }
 
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
-                            Combine(Values(1, 2, 3), Bool(), Bool()));
+                            Combine(Values(1, 2, 3, 4), Bool(), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingAbs,
-                            Combine(Values(1, 2, 3), Bool(), Bool()));
+                            Combine(Values(1, 2, 3, 4), Bool(), Bool()));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
                             Combine(Values(3, 4), Bool(), Values(false)));
 

From d56d04e41bfc88dd26aa4b9799e0f6922266183a Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 3 Mar 2015 01:34:29 +0100
Subject: [PATCH 20/40] Updated documentation

---
 modules/photo/include/opencv2/photo.hpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index c25a35e6d..446e81750 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -119,7 +119,7 @@ CV_EXPORTS_W void inpaint( InputArray src, InputArray inpaintMask,
 <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
 optimizations. Noise expected to be a gaussian white noise
 
-@param src Input 8-bit 1-channel, 2-channel or 3-channel image.
+@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
 @param dst Output image with the same size and type as src .
 @param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@@ -144,7 +144,7 @@ with several computational optimizations. Noise expected to be a
 gaussian white noise. Uses squared sum of absolute value distances
 instead of sum of squared distances for weight calculation
 
-@param src Input 8-bit or 16-bit 1-channel, 2-channel or 3-channel image.
+@param src Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel or 4-channel image.
 @param dst Output image with the same size and type as src .
 @param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@@ -190,8 +190,9 @@ captured in small period of time. For example video. This version of the functio
 images or for manual manipulation with colorspaces. For more details see
 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
 
-@param srcImgs Input 8-bit 1-channel, 2-channel or 3-channel images sequence. All images should
-have the same type and size.
+@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
+4-channel images sequence. All images should have the same type and
+size.
 @param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
 @param temporalWindowSize Number of surrounding images to use for target image denoising. Should
 be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
@@ -220,9 +221,9 @@ details see
 squared sum of absolute value distances instead of sum of squared
 distances for weight calculation
 
-@param srcImgs Input 8-bit or 16-bit 1-channel, 2-channel or 3-channel
-images sequence. All images should
-have the same type and size.
+@param srcImgs Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel
+or 4-channel images sequence. All images should have the same type and
+size.
 @param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
 @param temporalWindowSize Number of surrounding images to use for target image denoising. Should
 be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to

From 69eae13ff3f6115b6716c2dc1927b679580f9ced Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 3 Mar 2015 03:02:44 +0100
Subject: [PATCH 21/40] Changed pointers from sample_t * to uchar *. Rescaling
 psz accordingly.

---
 .../src/fast_nlmeans_denoising_opencl.hpp     |  7 +++--
 modules/photo/src/opencl/nlmeans.cl           | 29 ++++++++-----------
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index 41264045c..2fa11a351 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -89,13 +89,13 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
 
     char buf[4][40];
     String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
-                         " -D sample_t=%s -D pixel_t=%s -D int_t=%s"
+                         " -D pixel_t=%s -D int_t=%s"
                          " -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
                          " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
                          " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
                          templateWindowSize, searchWindowSize,
-                         ocl::typeToStr(depth), ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
+                         ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
                          depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
                          depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
                          (sprintf(buf[0], "long%d", cn), buf[0]),
@@ -103,7 +103,8 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
                          (sprintf(buf[1], "convert_long%d", cn), buf[1]),
                          BLOCK_COLS, BLOCK_ROWS,
                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
-                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn, cn == 3 ? 4 : cn,
+                         ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
+                         (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
                          ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : "");
 
     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl
index 91b012354..11837a5fc 100644
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -97,7 +97,7 @@ inline int calcDistUpDown(pixel_t down_value, pixel_t down_value_t, pixel_t up_v
 
 #define COND if (x == 0 && y == 0)
 
-inline void calcFirstElementInRow(__global const sample_t * src, int src_step, int src_offset,
+inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset,
                                   __local int * dists, int y, int x, int id,
                                   __global int * col_dists, __global int * up_col_dists)
 {
@@ -129,8 +129,8 @@ inline void calcFirstElementInRow(__global const sample_t * src, int src_step, i
                 dist += value;
             }
 
-            src_current = (__global const pixel_t *)((__global const sample_t *)src_current + src_step);
-            src_template = (__global const pixel_t *)((__global const sample_t *)src_template + src_step);
+            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
+            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
         }
 
         #pragma unroll
@@ -142,7 +142,7 @@ inline void calcFirstElementInRow(__global const sample_t * src, int src_step, i
     }
 }
 
-inline void calcElementInFirstRow(__global const sample_t * src, int src_step, int src_offset,
+inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset,
                                   __local int * dists, int y, int x0, int x, int id, int first,
                                   __global int * col_dists, __global int * up_col_dists)
 {
@@ -164,8 +164,8 @@ inline void calcElementInFirstRow(__global const sample_t * src, int src_step, i
         {
             col_dist += calcDist(src_current[0], src_template[0]);
 
-            src_current = (__global const pixel_t *)((__global const sample_t *)src_current + src_step);
-            src_template = (__global const pixel_t *)((__global const sample_t *)src_template + src_step);
+            src_current = (__global const pixel_t *)((__global const uchar *)src_current + src_step);
+            src_template = (__global const pixel_t *)((__global const uchar *)src_template + src_step);
         }
 
         dists[i] += col_dist - col_dists_current[first];
@@ -174,7 +174,7 @@ inline void calcElementInFirstRow(__global const sample_t * src, int src_step, i
     }
 }
 
-inline void calcElement(__global const sample_t * src, int src_step, int src_offset,
+inline void calcElement(__global const uchar * src, int src_step, int src_offset,
                         __local int * dists, int y, int x0, int x, int id, int first,
                         __global int * col_dists, __global int * up_col_dists)
 {
@@ -207,9 +207,9 @@ inline void calcElement(__global const sample_t * src, int src_step, int src_off
     }
 }
 
-inline void convolveWindow(__global const sample_t * src, int src_step, int src_offset,
+inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
                            __local int * dists, __global const int * almostDist2Weight,
-                           __global sample_t * dst, int dst_step, int dst_offset,
+                           __global uchar * dst, int dst_step, int dst_offset,
                            int y, int x, int id, __local weight_t * weights_local,
                            __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
 {
@@ -255,9 +255,9 @@ inline void convolveWindow(__global const sample_t * src, int src_step, int src_
     }
 }
 
-__kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step, int src_offset,
-                                   __global sample_t * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                                   __global const int * almostDist2Weight, __global sample_t * buffer,
+__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
+                                   __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
+                                   __global const int * almostDist2Weight, __global uchar * buffer,
                                    int almostTemplateWindowSizeSqBinShift)
 {
     int block_x = get_group_id(0), nblocks_x = get_num_groups(0);
@@ -277,11 +277,6 @@ __kernel void fastNlMeansDenoising(__global const sample_t * src, int src_step,
     __global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int));
     __global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE;
 
-    src_step /= sizeof(sample_t);
-    src_offset /= sizeof(sample_t);
-    dst_step /= sizeof(sample_t);
-    dst_offset /= sizeof(sample_t);
-
     for (int y = y0; y < y1; ++y)
         for (int x = x0; x < x1; ++x)
         {

From 305cff36e2c5a334821bd6e40eddd16ba304e6fe Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 5 Mar 2015 13:36:42 +0100
Subject: [PATCH 22/40] Changed from IT to int for distance calculation

---
 .../src/fast_nlmeans_denoising_invoker.hpp    |  50 +++----
 ...fast_nlmeans_denoising_invoker_commons.hpp | 124 +++++++++---------
 .../fast_nlmeans_multi_denoising_invoker.hpp  |  54 ++++----
 3 files changed, 114 insertions(+), 114 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 01588b03d..2ebf76af4 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -80,15 +80,15 @@ private:
     std::vector<IT> almost_dist2weight_;
 
     void calcDistSumsForFirstElementInRow(
-        int i, Array2d<IT>& dist_sums,
-        Array3d<IT>& col_dist_sums,
-        Array3d<IT>& up_col_dist_sums) const;
+        int i, Array2d<int>& dist_sums,
+        Array3d<int>& col_dist_sums,
+        Array3d<int>& up_col_dist_sums) const;
 
     void calcDistSumsForElementInFirstRow(
         int i, int j, int first_col_num,
-        Array2d<IT>& dist_sums,
-        Array3d<IT>& col_dist_sums,
-        Array3d<IT>& up_col_dist_sums) const;
+        Array2d<int>& dist_sums,
+        Array3d<int>& col_dist_sums,
+        Array3d<int>& up_col_dist_sums) const;
 };
 
 inline int getNearestPowerOf2(int value)
@@ -128,8 +128,8 @@ FastNlMeansDenoisingInvoker<T, IT, UIT, D>::FastNlMeansDenoisingInvoker(
     almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq);
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
-    IT max_dist = D::template maxDist<T, IT>();
-    size_t almost_max_dist = (size_t)(max_dist / almost_dist2actual_dist_multiplier + 1);
+    int max_dist = D::template maxDist<T>();
+    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight_.resize(almost_max_dist);
 
     const double WEIGHT_THRESHOLD = 0.001;
@@ -156,14 +156,14 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
     int row_to = range.end - 1;
 
     // sums of cols anf rows for current pixel p
-    Array2d<IT> dist_sums(search_window_size_, search_window_size_);
+    Array2d<int> dist_sums(search_window_size_, search_window_size_);
 
     // for lazy calc optimization (sum of cols for current pixel)
-    Array3d<IT> col_dist_sums(template_window_size_, search_window_size_, search_window_size_);
+    Array3d<int> col_dist_sums(template_window_size_, search_window_size_, search_window_size_);
 
     int first_col_num = -1;
     // last elements of column sum (for each element in row)
-    Array3d<IT> up_col_dist_sums(src_.cols, search_window_size_, search_window_size_);
+    Array3d<int> up_col_dist_sums(src_.cols, search_window_size_, search_window_size_);
 
     for (int i = row_from; i <= row_to; i++)
     {
@@ -202,9 +202,9 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
 
                     for (int y = 0; y < search_window_size; y++)
                     {
-                        IT * dist_sums_row = dist_sums.row_ptr(y);
-                        IT * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y);
-                        IT * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y);
+                        int * dist_sums_row = dist_sums.row_ptr(y);
+                        int * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y);
+                        int * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y);
 
                         const T * b_up_ptr = extended_src_.ptr<T>(start_by - template_window_half_size_ - 1 + y);
                         const T * b_down_ptr = extended_src_.ptr<T>(start_by + template_window_half_size_ + y);
@@ -215,7 +215,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
                             dist_sums_row[x] -= col_dist_sums_row[x];
 
                             int bx = start_bx + x;
-                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + D::template calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
+                            col_dist_sums_row[x] = up_col_dist_sums_row[x] + D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]);
 
                             dist_sums_row[x] += col_dist_sums_row[x];
                             up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -234,10 +234,10 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
             for (int y = 0; y < search_window_size_; y++)
             {
                 const T* cur_row_ptr = extended_src_.ptr<T>(border_size_ + search_window_y + y);
-                IT* dist_sums_row = dist_sums.row_ptr(y);
+                int* dist_sums_row = dist_sums.row_ptr(y);
                 for (int x = 0; x < search_window_size_; x++)
                 {
-                    size_t almostAvgDist = (size_t)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_);
+                    int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
                     IT weight = almost_dist2weight_[almostAvgDist];
                     weights_sum += weight;
 
@@ -257,9 +257,9 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
 template <typename T, typename IT, typename UIT, typename D>
 inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElementInRow(
     int i,
-    Array2d<IT>& dist_sums,
-    Array3d<IT>& col_dist_sums,
-    Array3d<IT>& up_col_dist_sums) const
+    Array2d<int>& dist_sums,
+    Array3d<int>& col_dist_sums,
+    Array3d<int>& up_col_dist_sums) const
 {
     int j = 0;
 
@@ -276,7 +276,7 @@ inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElem
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                 {
-                    int dist = D::template calcDist<T, IT>(extended_src_,
+                    int dist = D::template calcDist<T>(extended_src_,
                         border_size_ + i + ty, border_size_ + j + tx,
                         border_size_ + start_y + ty, border_size_ + start_x + tx);
 
@@ -291,9 +291,9 @@ inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElem
 template <typename T, typename IT, typename UIT, typename D>
 inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num,
-    Array2d<IT>& dist_sums,
-    Array3d<IT>& col_dist_sums,
-    Array3d<IT>& up_col_dist_sums) const
+    Array2d<int>& dist_sums,
+    Array3d<int>& col_dist_sums,
+    Array3d<int>& up_col_dist_sums) const
 {
     int ay = border_size_ + i;
     int ax = border_size_ + j + template_window_half_size_;
@@ -312,7 +312,7 @@ inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElementIn
             int by = start_by + y;
             int bx = start_bx + x;
             for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
-                col_dist_sums[new_last_col_num][y][x] += D::template calcDist<T,IT>(extended_src_, ay + ty, ax, by + ty, bx);
+                col_dist_sums[new_last_col_num][y][x] += D::template calcDist<T>(extended_src_, ay + ty, ax, by + ty, bx);
 
             dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x];
             up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x];
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index d77ca3e1f..dbb4c5eb3 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -83,63 +83,63 @@ template <typename T> struct pixelInfo: public pixelInfo_<T>
 
 class DistAbs
 {
-    template <typename T, typename IT> struct calcDist_
+    template <typename T> struct calcDist_
     {
-        static inline IT f(const T a, const T b)
+        static inline int f(const T a, const T b)
         {
-            return std::abs((IT)(a-b));
+            return std::abs((int)(a-b));
         }
     };
 
-    template <typename ET, typename IT> struct calcDist_<Vec<ET, 2>, IT>
+    template <typename ET> struct calcDist_<Vec<ET, 2> >
     {
-        static inline IT f(const Vec<ET, 2> a, const Vec<ET, 2> b)
+        static inline int f(const Vec<ET, 2> a, const Vec<ET, 2> b)
         {
-            return std::abs((IT)(a[0]-b[0])) + std::abs((IT)(a[1]-b[1]));
+            return std::abs((int)(a[0]-b[0])) + std::abs((int)(a[1]-b[1]));
         }
     };
 
-    template <typename ET, typename IT> struct calcDist_<Vec<ET, 3>, IT>
+    template <typename ET> struct calcDist_<Vec<ET, 3> >
     {
-        static inline IT f(const Vec<ET, 3> a, const Vec<ET, 3> b)
+        static inline int f(const Vec<ET, 3> a, const Vec<ET, 3> b)
         {
             return
-                std::abs((IT)(a[0]-b[0])) +
-                std::abs((IT)(a[1]-b[1])) +
-                std::abs((IT)(a[2]-b[2]));
+                std::abs((int)(a[0]-b[0])) +
+                std::abs((int)(a[1]-b[1])) +
+                std::abs((int)(a[2]-b[2]));
         }
     };
 
-    template <typename ET, typename IT> struct calcDist_<Vec<ET, 4>, IT>
+    template <typename ET> struct calcDist_<Vec<ET, 4> >
     {
-        static inline IT f(const Vec<ET, 4> a, const Vec<ET, 4> b)
+        static inline int f(const Vec<ET, 4> a, const Vec<ET, 4> b)
         {
             return
-                std::abs((IT)(a[0]-b[0])) +
-                std::abs((IT)(a[1]-b[1])) +
-                std::abs((IT)(a[2]-b[2])) +
-                std::abs((IT)(a[3]-b[3]));
+                std::abs((int)(a[0]-b[0])) +
+                std::abs((int)(a[1]-b[1])) +
+                std::abs((int)(a[2]-b[2])) +
+                std::abs((int)(a[3]-b[3]));
         }
     };
 
 public:
-    template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
+    template <typename T> static inline int calcDist(const T a, const T b)
     {
-        return calcDist_<T, IT>::f(a, b);
+        return calcDist_<T>::f(a, b);
     }
 
-    template <typename T, typename IT>
-    static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+    template <typename T>
+    static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
     {
         const T a = m.at<T>(i1, j1);
         const T b = m.at<T>(i2, j2);
-        return calcDist<T, IT>(a,b);
+        return calcDist<T>(a,b);
     }
 
-    template <typename T, typename IT>
-    static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+    template <typename T>
+    static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
     {
-        return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+        return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);
     };
 
     template <typename T>
@@ -148,93 +148,93 @@ public:
         return std::exp(-dist*dist / (h * h * pixelInfo<T>::channels));
     }
 
-    template <typename T, typename IT>
+    template <typename T>
     static double maxDist()
     {
-        return (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::channels;
+        return (int)pixelInfo<T>::sampleMax() * pixelInfo<T>::channels;
     }
 };
 
 class DistSquared
 {
-    template <typename T, typename IT> struct calcDist_
+    template <typename T> struct calcDist_
     {
-        static inline IT f(const T a, const T b)
+        static inline int f(const T a, const T b)
         {
-            return (IT)(a-b) * (IT)(a-b);
+            return (int)(a-b) * (int)(a-b);
         }
     };
 
-    template <typename ET, typename IT> struct calcDist_<Vec<ET, 2>, IT>
+    template <typename ET> struct calcDist_<Vec<ET, 2> >
     {
-        static inline IT f(const Vec<ET, 2> a, const Vec<ET, 2> b)
+        static inline int f(const Vec<ET, 2> a, const Vec<ET, 2> b)
         {
-            return (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) + (IT)(a[1]-b[1])*(IT)(a[1]-b[1]);
+            return (int)(a[0]-b[0])*(int)(a[0]-b[0]) + (int)(a[1]-b[1])*(int)(a[1]-b[1]);
         }
     };
 
-    template <typename ET, typename IT> struct calcDist_<Vec<ET, 3>, IT>
+    template <typename ET> struct calcDist_<Vec<ET, 3> >
     {
-        static inline IT f(const Vec<ET, 3> a, const Vec<ET, 3> b)
+        static inline int f(const Vec<ET, 3> a, const Vec<ET, 3> b)
         {
             return
-                (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
-                (IT)(a[1]-b[1])*(IT)(a[1]-b[1]) +
-                (IT)(a[2]-b[2])*(IT)(a[2]-b[2]);
+                (int)(a[0]-b[0])*(int)(a[0]-b[0]) +
+                (int)(a[1]-b[1])*(int)(a[1]-b[1]) +
+                (int)(a[2]-b[2])*(int)(a[2]-b[2]);
         }
     };
 
-    template <typename ET, typename IT> struct calcDist_<Vec<ET, 4>, IT>
+    template <typename ET> struct calcDist_<Vec<ET, 4> >
     {
-        static inline IT f(const Vec<ET, 4> a, const Vec<ET, 4> b)
+        static inline int f(const Vec<ET, 4> a, const Vec<ET, 4> b)
         {
             return
-                (IT)(a[0]-b[0])*(IT)(a[0]-b[0]) +
-                (IT)(a[1]-b[1])*(IT)(a[1]-b[1]) +
-                (IT)(a[2]-b[2])*(IT)(a[2]-b[2]) +
-                (IT)(a[3]-b[3])*(IT)(a[3]-b[3]);
+                (int)(a[0]-b[0])*(int)(a[0]-b[0]) +
+                (int)(a[1]-b[1])*(int)(a[1]-b[1]) +
+                (int)(a[2]-b[2])*(int)(a[2]-b[2]) +
+                (int)(a[3]-b[3])*(int)(a[3]-b[3]);
         }
     };
 
-    template <typename T, typename IT> struct calcUpDownDist_
+    template <typename T> struct calcUpDownDist_
     {
-        static inline IT f(T a_up, T a_down, T b_up, T b_down)
+        static inline int f(T a_up, T a_down, T b_up, T b_down)
         {
-            IT A = a_down - b_down;
-            IT B = a_up - b_up;
+            int A = a_down - b_down;
+            int B = a_up - b_up;
             return (A-B)*(A+B);
         }
     };
 
-    template <typename ET, int n, typename IT> struct calcUpDownDist_<Vec<ET, n>, IT>
+    template <typename ET, int n> struct calcUpDownDist_<Vec<ET, n> >
     {
     private:
         typedef Vec<ET, n> T;
     public:
-        static inline IT f(T a_up, T a_down, T b_up, T b_down)
+        static inline int f(T a_up, T a_down, T b_up, T b_down)
         {
-            return calcDist<T, IT>(a_down, b_down) - calcDist<T, IT>(a_up, b_up);
+            return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);
         }
     };
 
 public:
-    template <typename T, typename IT> static inline IT calcDist(const T a, const T b)
+    template <typename T> static inline int calcDist(const T a, const T b)
     {
-        return calcDist_<T, IT>::f(a, b);
+        return calcDist_<T>::f(a, b);
     }
 
-    template <typename T, typename IT>
-    static inline IT calcDist(const Mat& m, int i1, int j1, int i2, int j2)
+    template <typename T>
+    static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2)
     {
         const T a = m.at<T>(i1, j1);
         const T b = m.at<T>(i2, j2);
-        return calcDist<T, IT>(a,b);
+        return calcDist<T>(a,b);
     }
 
-    template <typename T, typename IT>
-    static inline IT calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
+    template <typename T>
+    static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down)
     {
-        return calcUpDownDist_<T, IT>::f(a_up, a_down, b_up, b_down);
+        return calcUpDownDist_<T>::f(a_up, a_down, b_up, b_down);
     };
 
     template <typename T>
@@ -243,11 +243,11 @@ public:
         return std::exp(-dist / (h * h * pixelInfo<T>::channels));
     }
 
-    template <typename T, typename IT>
+    template <typename T>
     static double maxDist()
     {
-        return (IT)pixelInfo<T>::sampleMax() * (IT)pixelInfo<T>::sampleMax() *
-            (IT)pixelInfo<T>::channels;
+        return (int)pixelInfo<T>::sampleMax() * (int)pixelInfo<T>::sampleMax() *
+            pixelInfo<T>::channels;
     }
 };
 
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index eb2078643..f1a334040 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -85,13 +85,13 @@ private:
     int almost_template_window_size_sq_bin_shift;
     std::vector<IT> almost_dist2weight;
 
-    void calcDistSumsForFirstElementInRow(int i, Array3d<IT>& dist_sums,
-                                          Array4d<IT>& col_dist_sums,
-                                          Array4d<IT>& up_col_dist_sums) const;
+    void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
+                                          Array4d<int>& col_dist_sums,
+                                          Array4d<int>& up_col_dist_sums) const;
 
     void calcDistSumsForElementInFirstRow(int i, int j, int first_col_num,
-                                          Array3d<IT>& dist_sums, Array4d<IT>& col_dist_sums,
-                                          Array4d<IT>& up_col_dist_sums) const;
+                                          Array3d<int>& dist_sums, Array4d<int>& col_dist_sums,
+                                          Array4d<int>& up_col_dist_sums) const;
 };
 
 template <typename T, typename IT, typename UIT, typename D>
@@ -139,8 +139,8 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::FastNlMeansMultiDenoisingInvoke
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
 
-    IT max_dist = D::template maxDist<T,IT>();
-    int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1);
+    int max_dist = D::template maxDist<T>();
+    int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight.resize(almost_max_dist);
 
     const double WEIGHT_THRESHOLD = 0.001;
@@ -166,13 +166,13 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
     int row_from = range.start;
     int row_to = range.end - 1;
 
-    Array3d<IT> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
+    Array3d<int> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
 
     // for lazy calc optimization
-    Array4d<IT> col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_);
+    Array4d<int> col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_);
 
     int first_col_num = -1;
-    Array4d<IT> up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_);
+    Array4d<int> up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_);
 
     for (int i = row_from; i <= row_to; i++)
     {
@@ -216,15 +216,15 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
                     for (int d = 0; d < temporal_window_size_; d++)
                     {
                         Mat cur_extended_src = extended_srcs_[d];
-                        Array2d<IT> cur_dist_sums = dist_sums[d];
-                        Array2d<IT> cur_col_dist_sums = col_dist_sums[first_col_num][d];
-                        Array2d<IT> cur_up_col_dist_sums = up_col_dist_sums[j][d];
+                        Array2d<int> cur_dist_sums = dist_sums[d];
+                        Array2d<int> cur_col_dist_sums = col_dist_sums[first_col_num][d];
+                        Array2d<int> cur_up_col_dist_sums = up_col_dist_sums[j][d];
                         for (int y = 0; y < search_window_size; y++)
                         {
-                            IT* dist_sums_row = cur_dist_sums.row_ptr(y);
+                            int* dist_sums_row = cur_dist_sums.row_ptr(y);
 
-                            IT* col_dist_sums_row = cur_col_dist_sums.row_ptr(y);
-                            IT* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y);
+                            int* col_dist_sums_row = cur_col_dist_sums.row_ptr(y);
+                            int* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y);
 
                             const T* b_up_ptr = cur_extended_src.ptr<T>(start_by - template_window_half_size_ - 1 + y);
                             const T* b_down_ptr = cur_extended_src.ptr<T>(start_by + template_window_half_size_ + y);
@@ -234,7 +234,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
                                 dist_sums_row[x] -= col_dist_sums_row[x];
 
                                 col_dist_sums_row[x] = up_col_dist_sums_row[x] +
-                                    D::template calcUpDownDist<T, IT>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
+                                    D::template calcUpDownDist<T>(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]);
 
                                 dist_sums_row[x] += col_dist_sums_row[x];
                                 up_col_dist_sums_row[x] = col_dist_sums_row[x];
@@ -260,11 +260,11 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
                 {
                     const T* cur_row_ptr = esrc_d.ptr<T>(border_size_ + search_window_y + y);
 
-                    IT* dist_sums_row = dist_sums.row_ptr(d, y);
+                    int* dist_sums_row = dist_sums.row_ptr(d, y);
 
                     for (int x = 0; x < search_window_size_; x++)
                     {
-                        size_t almostAvgDist = (size_t)(dist_sums_row[x] >> almost_template_window_size_sq_bin_shift);
+                        int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
 
                         IT weight =  almost_dist2weight[almostAvgDist];
                         weights_sum += weight;
@@ -286,7 +286,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
 
 template <typename T, typename IT, typename UIT, typename D>
 inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElementInRow(
-        int i, Array3d<IT>& dist_sums, Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
+        int i, Array3d<int>& dist_sums, Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
     int j = 0;
 
@@ -303,14 +303,14 @@ inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirs
                 int start_y = i + y - search_window_half_size_;
                 int start_x = j + x - search_window_half_size_;
 
-                IT* dist_sums_ptr = &dist_sums[d][y][x];
-                IT* col_dist_sums_ptr = &col_dist_sums[0][d][y][x];
+                int* dist_sums_ptr = &dist_sums[d][y][x];
+                int* col_dist_sums_ptr = &col_dist_sums[0][d][y][x];
                 int col_dist_sums_step = col_dist_sums.step_size(0);
                 for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++)
                 {
                     for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                     {
-                        IT dist = D::template calcDist<T, IT>(
+                        int dist = D::template calcDist<T>(
                                     main_extended_src_.at<T>(border_size_ + i + ty, border_size_ + j + tx),
                                     cur_extended_src.at<T>(border_size_ + start_y + ty, border_size_ + start_x + tx));
 
@@ -327,8 +327,8 @@ inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirs
 
 template <typename T, typename IT, typename UIT, typename D>
 inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElementInFirstRow(
-    int i, int j, int first_col_num, Array3d<IT>& dist_sums,
-    Array4d<IT>& col_dist_sums, Array4d<IT>& up_col_dist_sums) const
+    int i, int j, int first_col_num, Array3d<int>& dist_sums,
+    Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
     int ay = border_size_ + i;
     int ax = border_size_ + j + template_window_half_size_;
@@ -350,10 +350,10 @@ inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElem
                 int by = start_by + y;
                 int bx = start_bx + x;
 
-                IT* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
+                int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x];
                 for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++)
                 {
-                    *col_dist_sums_ptr += D::template calcDist<T, IT>(
+                    *col_dist_sums_ptr += D::template calcDist<T>(
                                 main_extended_src_.at<T>(ay + ty, ax),
                                 cur_extended_src.at<T>(by + ty, bx));
                 }

From 18be52c05b5d3167c937976c146a392675c828fc Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 5 Mar 2015 13:55:06 +0100
Subject: [PATCH 23/40] Changed LUTs from IT to int

---
 .../src/fast_nlmeans_denoising_invoker.hpp    | 13 +++++----
 ...fast_nlmeans_denoising_invoker_commons.hpp | 28 +++++++++----------
 .../fast_nlmeans_multi_denoising_invoker.hpp  | 13 +++++----
 3 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 2ebf76af4..ec154fbe6 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -75,9 +75,9 @@ private:
     int template_window_half_size_;
     int search_window_half_size_;
 
-    IT fixed_point_mult_;
+    int fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift_;
-    std::vector<IT> almost_dist2weight_;
+    std::vector<int> almost_dist2weight_;
 
     void calcDistSumsForFirstElementInRow(
         int i, Array2d<int>& dist_sums,
@@ -119,7 +119,8 @@ FastNlMeansDenoisingInvoker<T, IT, UIT, D>::FastNlMeansDenoisingInvoker(
 
     const IT max_estimate_sum_value =
         (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
-    fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;
+    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
+                                          std::numeric_limits<int>::max());
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -136,7 +137,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT, D>::FastNlMeansDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
+        int weight = (int)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
 
@@ -238,8 +239,8 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
                 for (int x = 0; x < search_window_size_; x++)
                 {
                     int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
-                    IT weight = almost_dist2weight_[almostAvgDist];
-                    weights_sum += weight;
+                    int weight = almost_dist2weight_[almostAvgDist];
+                    weights_sum += (IT)weight;
 
                     T p = cur_row_ptr[border_size_ + search_window_x + x];
                     incWithWeight<T, IT>(estimation, weight, p);
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index dbb4c5eb3..4d66efe46 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -253,39 +253,39 @@ public:
 
 template <typename T, typename IT> struct incWithWeight_
 {
-    static inline void f(IT* estimation, IT weight, T p)
+    static inline void f(IT* estimation, int weight, T p)
     {
-        estimation[0] += weight * p;
+        estimation[0] += (IT)weight * p;
     }
 };
 
 template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT>
 {
-    static inline void f(IT* estimation, IT weight, Vec<ET, 2> p)
+    static inline void f(IT* estimation, int weight, Vec<ET, 2> p)
     {
-        estimation[0] += weight * p[0];
-        estimation[1] += weight * p[1];
+        estimation[0] += (IT)weight * p[0];
+        estimation[1] += (IT)weight * p[1];
     }
 };
 
 template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT>
 {
-    static inline void f(IT* estimation, IT weight, Vec<ET, 3> p)
+    static inline void f(IT* estimation, int weight, Vec<ET, 3> p)
     {
-        estimation[0] += weight * p[0];
-        estimation[1] += weight * p[1];
-        estimation[2] += weight * p[2];
+        estimation[0] += (IT)weight * p[0];
+        estimation[1] += (IT)weight * p[1];
+        estimation[2] += (IT)weight * p[2];
     }
 };
 
 template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 4>, IT>
 {
-    static inline void f(IT* estimation, IT weight, Vec<ET, 4> p)
+    static inline void f(IT* estimation, int weight, Vec<ET, 4> p)
     {
-        estimation[0] += weight * p[0];
-        estimation[1] += weight * p[1];
-        estimation[2] += weight * p[2];
-        estimation[3] += weight * p[3];
+        estimation[0] += (IT)weight * p[0];
+        estimation[1] += (IT)weight * p[1];
+        estimation[2] += (IT)weight * p[2];
+        estimation[3] += (IT)weight * p[3];
     }
 };
 
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index f1a334040..f9c1264b2 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -81,9 +81,9 @@ private:
     int search_window_half_size_;
     int temporal_window_half_size_;
 
-    IT fixed_point_mult_;
+    int fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift;
-    std::vector<IT> almost_dist2weight;
+    std::vector<int> almost_dist2weight;
 
     void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
                                           Array4d<int>& col_dist_sums,
@@ -127,7 +127,8 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::FastNlMeansMultiDenoisingInvoke
     main_extended_src_ = extended_srcs_[temporal_window_half_size_];
     const IT max_estimate_sum_value =
         (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
-    fixed_point_mult_ = std::numeric_limits<IT>::max() / max_estimate_sum_value;
+    fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
+                                          std::numeric_limits<int>::max());
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -147,7 +148,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::FastNlMeansMultiDenoisingInvoke
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
+        int weight = (int)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
 
@@ -266,8 +267,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
                     {
                         int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
 
-                        IT weight =  almost_dist2weight[almostAvgDist];
-                        weights_sum += weight;
+                        int weight =  almost_dist2weight[almostAvgDist];
+                        weights_sum += (IT)weight;
 
                         T p = cur_row_ptr[border_size_ + search_window_x + x];
                         incWithWeight<T, IT>(estimation, weight, p);

From c41efe4e303d51bf207bb54f60d2f4508acfe53d Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 5 Mar 2015 17:50:52 +0100
Subject: [PATCH 24/40] Refactoring in preparation for per-channel h-values

---
 modules/photo/src/denoising.cpp               |  96 ++++++------
 .../src/fast_nlmeans_denoising_invoker.hpp    |  45 +++---
 ...fast_nlmeans_denoising_invoker_commons.hpp | 147 +++++++++++++++---
 .../fast_nlmeans_multi_denoising_invoker.hpp  |  48 +++---
 4 files changed, 216 insertions(+), 120 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index b41f83ec9..29899f791 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -65,23 +65,23 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC4:
             parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         default:
             CV_Error(Error::StsBadArg,
@@ -104,43 +104,43 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC4:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16UC2:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16UC3:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16UC4:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, &h));
             break;
         default:
             CV_Error(Error::StsBadArg,
@@ -239,27 +239,27 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
     {
         case CV_8U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared>(
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC4:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         default:
             CV_Error(Error::StsBadArg,
@@ -285,51 +285,51 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
     {
         case CV_8U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_8UC4:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         case CV_16UC4:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h));
             break;
         default:
             CV_Error(Error::StsBadArg,
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index ec154fbe6..9dea2a02f 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -50,13 +50,13 @@
 
 using namespace cv;
 
-template <typename T, typename IT, typename UIT, typename D>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansDenoisingInvoker :
         public ParallelLoopBody
 {
 public:
     FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
-        int template_window_size, int search_window_size, const float h);
+        int template_window_size, int search_window_size, const float *h);
 
     void operator() (const Range& range) const;
 
@@ -77,7 +77,7 @@ private:
 
     int fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift_;
-    std::vector<int> almost_dist2weight_;
+    std::vector<WT> almost_dist2weight_;
 
     void calcDistSumsForFirstElementInRow(
         int i, Array2d<int>& dist_sums,
@@ -99,12 +99,12 @@ inline int getNearestPowerOf2(int value)
     return p;
 }
 
-template <typename T, typename IT, typename UIT, typename D>
-FastNlMeansDenoisingInvoker<T, IT, UIT, D>::FastNlMeansDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansDenoisingInvoker(
     const Mat& src, Mat& dst,
     int template_window_size,
     int search_window_size,
-    const float h) :
+    const float *h) :
     src_(src), dst_(dst)
 {
     CV_Assert(src.channels() == pixelInfo<T>::channels);
@@ -133,25 +133,20 @@ FastNlMeansDenoisingInvoker<T, IT, UIT, D>::FastNlMeansDenoisingInvoker(
     int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight_.resize(almost_max_dist);
 
-    const double WEIGHT_THRESHOLD = 0.001;
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = (int)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
-
-        almost_dist2weight_[almost_dist] = weight;
+        almost_dist2weight_[almost_dist] =
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
     }
-    CV_Assert(almost_dist2weight_[0] == fixed_point_mult_);
 
     // additional optimization init end
     if (dst_.empty())
         dst_ = Mat::zeros(src_.size(), src_.type());
 }
 
-template <typename T, typename IT, typename UIT, typename D>
-void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
@@ -228,9 +223,9 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
             }
 
             // calc weights
-            IT estimation[pixelInfo<T>::channels], weights_sum = 0;
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<T>::channels];
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] = 0;
+                estimation[channel_num] = weights_sum[channel_num] = 0;
 
             for (int y = 0; y < search_window_size_; y++)
             {
@@ -240,23 +235,23 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range)
                 {
                     int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
                     int weight = almost_dist2weight_[almostAvgDist];
-                    weights_sum += (IT)weight;
-
                     T p = cur_row_ptr[border_size_ + search_window_x + x];
-                    incWithWeight<T, IT>(estimation, weight, p);
+                    incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                 }
             }
 
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum/2) / weights_sum;
+                estimation[channel_num] =
+                    (static_cast<UIT>(estimation[channel_num]) + weights_sum[channel_num]/2) /
+                    weights_sum[channel_num];
 
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
         }
     }
 }
 
-template <typename T, typename IT, typename UIT, typename D>
-inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
     int i,
     Array2d<int>& dist_sums,
     Array3d<int>& col_dist_sums,
@@ -289,8 +284,8 @@ inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElem
         }
 }
 
-template <typename T, typename IT, typename UIT, typename D>
-inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num,
     Array2d<int>& dist_sums,
     Array3d<int>& col_dist_sums,
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index 4d66efe46..53a6f5ed6 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -122,6 +122,36 @@ class DistAbs
         }
     };
 
+    static const double WEIGHT_THRESHOLD = 0.001;
+    template <typename T, typename WT> struct calcWeight_
+    {
+        static inline WT f(double dist, const float *h, int fixed_point_mult)
+        {
+            WT weight = (WT)round(fixed_point_mult *
+                                  std::exp(-dist*dist / (h[0]*h[0] * pixelInfo<T>::channels)));
+            if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
+                weight = 0;
+            return weight;
+        }
+    };
+
+    template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
+    {
+        static inline Vec<ET, n> f(double dist, const float *h, int fixed_point_mult)
+        {
+            Vec<ET, n> res;
+            for (int i=0; i<n; i++)
+            {
+                ET weight = (ET)round(fixed_point_mult *
+                                      std::exp(-dist*dist / (h[i]*h[i] * pixelInfo<T>::channels)));
+                if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
+                    weight = 0;
+                res[i] = weight;
+            }
+            return res;
+        }
+    };
+
 public:
     template <typename T> static inline int calcDist(const T a, const T b)
     {
@@ -142,14 +172,14 @@ public:
         return calcDist<T>(a_down, b_down) - calcDist<T>(a_up, b_up);
     };
 
-    template <typename T>
-    static double calcWeight(double dist, double h)
+    template <typename T, typename WT>
+    static inline WT calcWeight(double dist, const float *h, int fixed_point_mult)
     {
-        return std::exp(-dist*dist / (h * h * pixelInfo<T>::channels));
+        return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);
     }
 
     template <typename T>
-    static double maxDist()
+    static inline double maxDist()
     {
         return (int)pixelInfo<T>::sampleMax() * pixelInfo<T>::channels;
     }
@@ -217,6 +247,36 @@ class DistSquared
         }
     };
 
+    static const double WEIGHT_THRESHOLD = 0.001;
+    template <typename T, typename WT> struct calcWeight_
+    {
+        static inline WT f(double dist, const float *h, int fixed_point_mult)
+        {
+            WT weight = (WT)round(fixed_point_mult *
+                                  std::exp(-dist / (h[0]*h[0] * pixelInfo<T>::channels)));
+            if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
+                weight = 0;
+            return weight;
+        }
+    };
+
+    template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
+    {
+        static inline Vec<ET, n> f(double dist, const float *h, int fixed_point_mult)
+        {
+            Vec<ET, n> res;
+            for (int i=0; i<n; i++)
+            {
+                ET weight = (ET)round(fixed_point_mult *
+                                      std::exp(-dist / (h[i]*h[i] * pixelInfo<T>::channels)));
+                if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
+                    weight = 0;
+                res[i] = weight;
+            }
+            return res;
+        }
+    };
+
 public:
     template <typename T> static inline int calcDist(const T a, const T b)
     {
@@ -237,62 +297,111 @@ public:
         return calcUpDownDist_<T>::f(a_up, a_down, b_up, b_down);
     };
 
-    template <typename T>
-    static double calcWeight(double dist, double h)
+    template <typename T, typename WT>
+    static inline WT calcWeight(double dist, const float *h, int fixed_point_mult)
     {
-        return std::exp(-dist / (h * h * pixelInfo<T>::channels));
+        return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);
     }
 
     template <typename T>
-    static double maxDist()
+    static inline double maxDist()
     {
         return (int)pixelInfo<T>::sampleMax() * (int)pixelInfo<T>::sampleMax() *
             pixelInfo<T>::channels;
     }
 };
 
-template <typename T, typename IT> struct incWithWeight_
+template <typename T, typename IT, typename WT> struct incWithWeight_
 {
-    static inline void f(IT* estimation, int weight, T p)
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, T p)
     {
         estimation[0] += (IT)weight * p;
+        weights_sum[0] += (IT)weight;
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT>
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT, int>
 {
-    static inline void f(IT* estimation, int weight, Vec<ET, 2> p)
+    static inline void f(IT* estimation, IT* weights_sum, int weight, Vec<ET, 2> p)
     {
         estimation[0] += (IT)weight * p[0];
         estimation[1] += (IT)weight * p[1];
+        weights_sum[0] += (IT)weight;
+        weights_sum[1] += (IT)weight;
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT>
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT, int>
 {
-    static inline void f(IT* estimation, int weight, Vec<ET, 3> p)
+    static inline void f(IT* estimation, IT* weights_sum, int weight, Vec<ET, 3> p)
     {
         estimation[0] += (IT)weight * p[0];
         estimation[1] += (IT)weight * p[1];
         estimation[2] += (IT)weight * p[2];
+        weights_sum[0] += (IT)weight;
+        weights_sum[1] += (IT)weight;
+        weights_sum[2] += (IT)weight;
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 4>, IT>
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 4>, IT, int>
 {
-    static inline void f(IT* estimation, int weight, Vec<ET, 4> p)
+    static inline void f(IT* estimation, IT* weights_sum, int weight, Vec<ET, 4> p)
     {
         estimation[0] += (IT)weight * p[0];
         estimation[1] += (IT)weight * p[1];
         estimation[2] += (IT)weight * p[2];
         estimation[3] += (IT)weight * p[3];
+        weights_sum[0] += (IT)weight;
+        weights_sum[1] += (IT)weight;
+        weights_sum[2] += (IT)weight;
+        weights_sum[3] += (IT)weight;
     }
 };
 
-template <typename T, typename IT>
-static inline void incWithWeight(IT* estimation, IT weight, T p)
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT, Vec<int, 2> >
 {
-    return incWithWeight_<T, IT>::f(estimation, weight, p);
+    static inline void f(IT* estimation, IT* weights_sum, Vec<int, 2> weight, Vec<ET, 2> p)
+    {
+        estimation[0] += (IT)weight[0] * p[0];
+        estimation[1] += (IT)weight[1] * p[1];
+        weights_sum[0] += (IT)weight[0];
+        weights_sum[1] += (IT)weight[1];
+    }
+};
+
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT, Vec<int, 3> >
+{
+    static inline void f(IT* estimation, IT* weights_sum, Vec<int, 3> weight, Vec<ET, 3> p)
+    {
+        estimation[0] += (IT)weight[0] * p[0];
+        estimation[1] += (IT)weight[1] * p[1];
+        estimation[2] += (IT)weight[2] * p[2];
+        weights_sum[0] += (IT)weight[0];
+        weights_sum[1] += (IT)weight[1];
+        weights_sum[2] += (IT)weight[2];
+    }
+};
+
+template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 4>, IT, Vec<int, 4> >
+{
+    static inline void f(IT* estimation, IT* weights_sum, Vec<int, 4> weight, Vec<ET, 4> p)
+    {
+        estimation[0] += (IT)weight[0] * p[0];
+        estimation[1] += (IT)weight[1] * p[1];
+        estimation[2] += (IT)weight[2] * p[2];
+        estimation[3] += (IT)weight[3] * p[3];
+        weights_sum[0] += (IT)weight[0];
+        weights_sum[1] += (IT)weight[1];
+        weights_sum[2] += (IT)weight[2];
+        weights_sum[3] += (IT)weight[3];
+    }
+};
+
+template <typename T, typename IT, typename WT>
+static inline void incWithWeight(IT* estimation, IT* weights_sum, IT weight, T p)
+{
+    return incWithWeight_<T, IT, WT>::f(estimation, weights_sum, weight, p);
 }
 
 template <typename T, typename IT> struct saturateCastFromArray_
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index f9c1264b2..489ee673f 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -50,14 +50,14 @@
 
 using namespace cv;
 
-template <typename T, typename IT, typename UIT, typename D>
+template <typename T, typename IT, typename UIT, typename D, typename WT>
 struct FastNlMeansMultiDenoisingInvoker :
         ParallelLoopBody
 {
 public:
     FastNlMeansMultiDenoisingInvoker(const std::vector<Mat>& srcImgs, int imgToDenoiseIndex,
                                      int temporalWindowSize, Mat& dst, int template_window_size,
-                                     int search_window_size, const float h);
+                                     int search_window_size, const float *h);
 
     void operator() (const Range& range) const;
 
@@ -83,7 +83,7 @@ private:
 
     int fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift;
-    std::vector<int> almost_dist2weight;
+    std::vector<WT> almost_dist2weight;
 
     void calcDistSumsForFirstElementInRow(int i, Array3d<int>& dist_sums,
                                           Array4d<int>& col_dist_sums,
@@ -94,15 +94,15 @@ private:
                                           Array4d<int>& up_col_dist_sums) const;
 };
 
-template <typename T, typename IT, typename UIT, typename D>
-FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::FastNlMeansMultiDenoisingInvoker(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansMultiDenoisingInvoker(
     const std::vector<Mat>& srcImgs,
     int imgToDenoiseIndex,
     int temporalWindowSize,
     cv::Mat& dst,
     int template_window_size,
     int search_window_size,
-    const float h) :
+    const float *h) :
         dst_(dst), extended_srcs_(srcImgs.size())
 {
     CV_Assert(srcImgs.size() > 0);
@@ -144,25 +144,20 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::FastNlMeansMultiDenoisingInvoke
     int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1);
     almost_dist2weight.resize(almost_max_dist);
 
-    const double WEIGHT_THRESHOLD = 0.001;
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        int weight = (int)round(fixed_point_mult_ * D::template calcWeight<T>(dist, h));
-        if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
-            weight = 0;
-
-        almost_dist2weight[almost_dist] = weight;
+        almost_dist2weight[almost_dist] =
+            D::template calcWeight<T, WT>(dist, h, fixed_point_mult_);
     }
-    CV_Assert(almost_dist2weight[0] == fixed_point_mult_);
 
     // additional optimization init end
     if (dst_.empty())
         dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type());
 }
 
-template <typename T, typename IT, typename UIT, typename D>
-void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& range) const
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& range) const
 {
     int row_from = range.start;
     int row_to = range.end - 1;
@@ -248,11 +243,9 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
             }
 
             // calc weights
-            IT weights_sum = 0;
-
-            IT estimation[pixelInfo<T>::channels];
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<T>::channels];
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] = 0;
+                estimation[channel_num] = weights_sum[channel_num] = 0;
 
             for (int d = 0; d < temporal_window_size_; d++)
             {
@@ -268,25 +261,24 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::operator() (const Range& r
                         int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
 
                         int weight =  almost_dist2weight[almostAvgDist];
-                        weights_sum += (IT)weight;
-
                         T p = cur_row_ptr[border_size_ + search_window_x + x];
-                        incWithWeight<T, IT>(estimation, weight, p);
+                        incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                     }
                 }
             }
 
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum;
+                estimation[channel_num] =
+                    (static_cast<UIT>(estimation[channel_num]) + weights_sum[channel_num] / 2) /
+                    weights_sum[channel_num];
 
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
-
         }
     }
 }
 
-template <typename T, typename IT, typename UIT, typename D>
-inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirstElementInRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForFirstElementInRow(
         int i, Array3d<int>& dist_sums, Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {
     int j = 0;
@@ -326,8 +318,8 @@ inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForFirs
     }
 }
 
-template <typename T, typename IT, typename UIT, typename D>
-inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D>::calcDistSumsForElementInFirstRow(
+template <typename T, typename IT, typename UIT, typename D, typename WT>
+inline void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::calcDistSumsForElementInFirstRow(
     int i, int j, int first_col_num, Array3d<int>& dist_sums,
     Array4d<int>& col_dist_sums, Array4d<int>& up_col_dist_sums) const
 {

From 1e82a67cc4d082abe9437dd163314a543bd90232 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 6 Mar 2015 14:28:43 +0100
Subject: [PATCH 25/40] Additional refactoring

---
 .../src/fast_nlmeans_denoising_invoker.hpp    | 17 ++--
 ...fast_nlmeans_denoising_invoker_commons.hpp | 91 +++++++++++--------
 .../fast_nlmeans_multi_denoising_invoker.hpp  | 17 ++--
 3 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 9dea2a02f..ff35550df 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -75,7 +75,7 @@ private:
     int template_window_half_size_;
     int search_window_half_size_;
 
-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift_;
     std::vector<WT> almost_dist2weight_;
 
@@ -120,7 +120,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansDenoisingInvoker(
     const IT max_estimate_sum_value =
         (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
     fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
-                                          std::numeric_limits<int>::max());
+                                          pixelInfo<WT>::sampleMax());
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -223,9 +223,11 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& ra
             }
 
             // calc weights
-            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<T>::channels];
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] = weights_sum[channel_num] = 0;
+                estimation[channel_num] = 0;
+            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
+                weights_sum[channel_num] = 0;
 
             for (int y = 0; y < search_window_size_; y++)
             {
@@ -240,11 +242,8 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& ra
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] =
-                    (static_cast<UIT>(estimation[channel_num]) + weights_sum[channel_num]/2) /
-                    weights_sum[channel_num];
-
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
+                                                                                      weights_sum);
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
         }
     }
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index 53a6f5ed6..df8e4703e 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -122,11 +122,11 @@ class DistAbs
         }
     };
 
-    static const double WEIGHT_THRESHOLD = 0.001;
     template <typename T, typename WT> struct calcWeight_
     {
-        static inline WT f(double dist, const float *h, int fixed_point_mult)
+        static inline WT f(double dist, const float *h, WT fixed_point_mult)
         {
+            static const double WEIGHT_THRESHOLD = 0.001;
             WT weight = (WT)round(fixed_point_mult *
                                   std::exp(-dist*dist / (h[0]*h[0] * pixelInfo<T>::channels)));
             if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
@@ -137,17 +137,11 @@ class DistAbs
 
     template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
     {
-        static inline Vec<ET, n> f(double dist, const float *h, int fixed_point_mult)
+        static inline Vec<ET, n> f(double dist, const float *h, ET fixed_point_mult)
         {
             Vec<ET, n> res;
             for (int i=0; i<n; i++)
-            {
-                ET weight = (ET)round(fixed_point_mult *
-                                      std::exp(-dist*dist / (h[i]*h[i] * pixelInfo<T>::channels)));
-                if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
-                    weight = 0;
-                res[i] = weight;
-            }
+                res[i] = calcWeight<T, ET>(dist, &h[i], fixed_point_mult);
             return res;
         }
     };
@@ -247,11 +241,11 @@ class DistSquared
         }
     };
 
-    static const double WEIGHT_THRESHOLD = 0.001;
     template <typename T, typename WT> struct calcWeight_
     {
         static inline WT f(double dist, const float *h, int fixed_point_mult)
         {
+            static const double WEIGHT_THRESHOLD = 0.001;
             WT weight = (WT)round(fixed_point_mult *
                                   std::exp(-dist / (h[0]*h[0] * pixelInfo<T>::channels)));
             if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
@@ -266,13 +260,7 @@ class DistSquared
         {
             Vec<ET, n> res;
             for (int i=0; i<n; i++)
-            {
-                ET weight = (ET)round(fixed_point_mult *
-                                      std::exp(-dist / (h[i]*h[i] * pixelInfo<T>::channels)));
-                if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
-                    weight = 0;
-                res[i] = weight;
-            }
+                res[i] = calcWeight<T, ET>(dist, &h[i], fixed_point_mult);
             return res;
         }
     };
@@ -320,48 +308,42 @@ template <typename T, typename IT, typename WT> struct incWithWeight_
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT, int>
+template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 2>, IT, WT>
 {
-    static inline void f(IT* estimation, IT* weights_sum, int weight, Vec<ET, 2> p)
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 2> p)
     {
         estimation[0] += (IT)weight * p[0];
         estimation[1] += (IT)weight * p[1];
         weights_sum[0] += (IT)weight;
-        weights_sum[1] += (IT)weight;
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT, int>
+template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 3>, IT, WT>
 {
-    static inline void f(IT* estimation, IT* weights_sum, int weight, Vec<ET, 3> p)
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 3> p)
     {
         estimation[0] += (IT)weight * p[0];
         estimation[1] += (IT)weight * p[1];
         estimation[2] += (IT)weight * p[2];
         weights_sum[0] += (IT)weight;
-        weights_sum[1] += (IT)weight;
-        weights_sum[2] += (IT)weight;
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 4>, IT, int>
+template <typename ET, typename IT, typename WT> struct incWithWeight_<Vec<ET, 4>, IT, WT>
 {
-    static inline void f(IT* estimation, IT* weights_sum, int weight, Vec<ET, 4> p)
+    static inline void f(IT* estimation, IT* weights_sum, WT weight, Vec<ET, 4> p)
     {
         estimation[0] += (IT)weight * p[0];
         estimation[1] += (IT)weight * p[1];
         estimation[2] += (IT)weight * p[2];
         estimation[3] += (IT)weight * p[3];
         weights_sum[0] += (IT)weight;
-        weights_sum[1] += (IT)weight;
-        weights_sum[2] += (IT)weight;
-        weights_sum[3] += (IT)weight;
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT, Vec<int, 2> >
+template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 2>, IT, Vec<EW, 2> >
 {
-    static inline void f(IT* estimation, IT* weights_sum, Vec<int, 2> weight, Vec<ET, 2> p)
+    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 2> weight, Vec<ET, 2> p)
     {
         estimation[0] += (IT)weight[0] * p[0];
         estimation[1] += (IT)weight[1] * p[1];
@@ -370,9 +352,9 @@ template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 2>, IT, Vec<in
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT, Vec<int, 3> >
+template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 3>, IT, Vec<EW, 3> >
 {
-    static inline void f(IT* estimation, IT* weights_sum, Vec<int, 3> weight, Vec<ET, 3> p)
+    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 3> weight, Vec<ET, 3> p)
     {
         estimation[0] += (IT)weight[0] * p[0];
         estimation[1] += (IT)weight[1] * p[1];
@@ -383,9 +365,9 @@ template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 3>, IT, Vec<in
     }
 };
 
-template <typename ET, typename IT> struct incWithWeight_<Vec<ET, 4>, IT, Vec<int, 4> >
+template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 4>, IT, Vec<EW, 4> >
 {
-    static inline void f(IT* estimation, IT* weights_sum, Vec<int, 4> weight, Vec<ET, 4> p)
+    static inline void f(IT* estimation, IT* weights_sum, Vec<EW, 4> weight, Vec<ET, 4> p)
     {
         estimation[0] += (IT)weight[0] * p[0];
         estimation[1] += (IT)weight[1] * p[1];
@@ -404,6 +386,43 @@ static inline void incWithWeight(IT* estimation, IT* weights_sum, IT weight, T p
     return incWithWeight_<T, IT, WT>::f(estimation, weights_sum, weight, p);
 }
 
+template <typename IT, typename UIT, int nc, int nw> struct divByWeightsSum_
+{
+    static inline void f(IT* estimation, IT* weights_sum);
+};
+
+template <typename IT, typename UIT> struct divByWeightsSum_<IT, UIT, 1, 1>
+{
+    static inline void f(IT* estimation, IT* weights_sum)
+    {
+        estimation[0] = (static_cast<UIT>(estimation[0]) + weights_sum[0]/2) / weights_sum[0];
+    }
+};
+
+template <typename IT, typename UIT, int n> struct divByWeightsSum_<IT, UIT, n, 1>
+{
+    static inline void f(IT* estimation, IT* weights_sum)
+    {
+        for (size_t i = 0; i < n; i++)
+            estimation[i] = (static_cast<UIT>(estimation[i]) + weights_sum[0]/2) / weights_sum[0];
+    }
+};
+
+template <typename IT, typename UIT, int n> struct divByWeightsSum_<IT, UIT, n, n>
+{
+    static inline void f(IT* estimation, IT* weights_sum)
+    {
+        for (size_t i = 0; i < n; i++)
+            estimation[i] = (static_cast<UIT>(estimation[i]) + weights_sum[i]/2) / weights_sum[i];
+    }
+};
+
+template <typename IT, typename UIT, int nc, int nw>
+static inline void divByWeightsSum(IT* estimation, IT* weights_sum)
+{
+    return divByWeightsSum_<IT, UIT, nc, nw>::f(estimation, weights_sum);
+}
+
 template <typename T, typename IT> struct saturateCastFromArray_
 {
     static inline T f(IT* estimation)
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 489ee673f..cd3833a56 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -81,7 +81,7 @@ private:
     int search_window_half_size_;
     int temporal_window_half_size_;
 
-    int fixed_point_mult_;
+    typename pixelInfo<WT>::sampleType fixed_point_mult_;
     int almost_template_window_size_sq_bin_shift;
     std::vector<WT> almost_dist2weight;
 
@@ -128,7 +128,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::FastNlMeansMultiDenoisingIn
     const IT max_estimate_sum_value =
         (IT)temporal_window_size_ * (IT)search_window_size_ * (IT)search_window_size_ * (IT)pixelInfo<T>::sampleMax();
     fixed_point_mult_ = (int)std::min<IT>(std::numeric_limits<IT>::max() / max_estimate_sum_value,
-                                          std::numeric_limits<int>::max());
+                                          pixelInfo<WT>::sampleMax());
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
@@ -243,9 +243,11 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Rang
             }
 
             // calc weights
-            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<T>::channels];
+            IT estimation[pixelInfo<T>::channels], weights_sum[pixelInfo<WT>::channels];
             for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] = weights_sum[channel_num] = 0;
+                estimation[channel_num] = 0;
+            for (size_t channel_num = 0; channel_num < pixelInfo<WT>::channels; channel_num++)
+                weights_sum[channel_num] = 0;
 
             for (int d = 0; d < temporal_window_size_; d++)
             {
@@ -267,11 +269,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Rang
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
-                estimation[channel_num] =
-                    (static_cast<UIT>(estimation[channel_num]) + weights_sum[channel_num] / 2) /
-                    weights_sum[channel_num];
-
+            divByWeightsSum<IT, UIT, pixelInfo<T>::channels, pixelInfo<WT>::channels>(estimation,
+                                                                                      weights_sum);
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
         }
     }

From 41ffcc27dd0887ee5942a9d48761f6958df0f318 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 6 Mar 2015 15:06:11 +0100
Subject: [PATCH 26/40] Added support for h = 0.0

---
 ...fast_nlmeans_denoising_invoker_commons.hpp | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index df8e4703e..efd482f6b 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -126,11 +126,13 @@ class DistAbs
     {
         static inline WT f(double dist, const float *h, WT fixed_point_mult)
         {
+            double w = std::exp(-dist*dist / (h[0]*h[0] * pixelInfo<T>::channels));
+            if (std::isnan(w)) w = 1.0; // Handle h = 0.0
+
             static const double WEIGHT_THRESHOLD = 0.001;
-            WT weight = (WT)round(fixed_point_mult *
-                                  std::exp(-dist*dist / (h[0]*h[0] * pixelInfo<T>::channels)));
-            if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
-                weight = 0;
+            WT weight = (WT)round(fixed_point_mult * w);
+            if (weight < WEIGHT_THRESHOLD * fixed_point_mult) weight = 0;
+
             return weight;
         }
     };
@@ -167,7 +169,8 @@ public:
     };
 
     template <typename T, typename WT>
-    static inline WT calcWeight(double dist, const float *h, int fixed_point_mult)
+    static inline WT calcWeight(double dist, const float *h,
+                                typename pixelInfo<WT>::sampleType fixed_point_mult)
     {
         return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);
     }
@@ -243,20 +246,22 @@ class DistSquared
 
     template <typename T, typename WT> struct calcWeight_
     {
-        static inline WT f(double dist, const float *h, int fixed_point_mult)
+        static inline WT f(double dist, const float *h, WT fixed_point_mult)
         {
+            double w = std::exp(-dist / (h[0]*h[0] * pixelInfo<T>::channels));
+            if (std::isnan(w)) w = 1.0; // Handle h = 0.0
+
             static const double WEIGHT_THRESHOLD = 0.001;
-            WT weight = (WT)round(fixed_point_mult *
-                                  std::exp(-dist / (h[0]*h[0] * pixelInfo<T>::channels)));
-            if (weight < WEIGHT_THRESHOLD * fixed_point_mult)
-                weight = 0;
+            WT weight = (WT)round(fixed_point_mult * w);
+            if (weight < WEIGHT_THRESHOLD * fixed_point_mult) weight = 0;
+
             return weight;
         }
     };
 
     template <typename T, typename ET, int n> struct calcWeight_<T, Vec<ET, n> >
     {
-        static inline Vec<ET, n> f(double dist, const float *h, int fixed_point_mult)
+        static inline Vec<ET, n> f(double dist, const float *h, ET fixed_point_mult)
         {
             Vec<ET, n> res;
             for (int i=0; i<n; i++)
@@ -286,7 +291,8 @@ public:
     };
 
     template <typename T, typename WT>
-    static inline WT calcWeight(double dist, const float *h, int fixed_point_mult)
+    static inline WT calcWeight(double dist, const float *h,
+                                typename pixelInfo<WT>::sampleType fixed_point_mult)
     {
         return calcWeight_<T, WT>::f(dist, h, fixed_point_mult);
     }

From 324fa26848f1020d125bd45e1fa5459c07fb092a Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 6 Mar 2015 19:07:13 +0100
Subject: [PATCH 27/40] Refactoring of OpenCL implementation

---
 modules/photo/src/denoising.cpp               |  6 ++-
 .../src/fast_nlmeans_denoising_opencl.hpp     | 38 ++++++++++++-------
 modules/photo/src/opencl/nlmeans.cl           | 31 ++++++++-------
 3 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 29899f791..30f638d4c 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -51,7 +51,8 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     Size src_size = _src.size();
     CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, false))
+               ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
+                                        templateWindowSize, searchWindowSize, false))
 
     Mat src = _src.getMat();
     _dst.create(src_size, src.type());
@@ -95,7 +96,8 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
     Size src_size = _src.size();
     CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize, true))
+               ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
+                                        templateWindowSize, searchWindowSize, true))
 
     Mat src = _src.getMat();
     _dst.create(src_size, src.type());
diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index 2fa11a351..a06dc6192 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -29,7 +29,7 @@ static int divUp(int a, int b)
 }
 
 template <typename FT, typename ST, typename WT>
-static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn,
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT *h, int hn, int cn,
                                       int & almostTemplateWindowSizeSqBinShift, bool abs)
 {
     const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
@@ -53,24 +53,32 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
     int maxDist = abs ? std::numeric_limits<ST>::max() * cn :
         std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
     int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
-    FT den = 1.0f / (h * h * cn);
+    FT den[4];
+    CV_Assert(hn > 0 && hn <= 4);
+    for (int i=0; i<hn; i++)
+        den[i] = 1.0f / (h[i] * h[i] * cn);
 
-    almostDist2Weight.create(1, almostMaxDist, CV_32SC1);
+    almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
 
+    char buf[40];
     ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
-                  format("-D OP_CALC_WEIGHTS -D FT=%s%s%s", ocl::typeToStr(depth),
+                  format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
+                         " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
+                         ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
+                         ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "", abs ? " -D ABS" : ""));
     if (k.empty())
         return false;
 
     k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
-           almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD);
+           almostDist2ActualDistMultiplier, fixedPointMult,
+           ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
 
     size_t globalsize[1] = { almostMaxDist };
     return k.run(1, globalsize, NULL, false);
 }
 
-static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
+static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float *h, int hn,
                                      int templateWindowSize, int searchWindowSize, bool abs)
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
@@ -89,18 +97,22 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
 
     char buf[4][40];
     String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
-                         " -D pixel_t=%s -D int_t=%s"
-                         " -D weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
+                         " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
+                         " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
                          " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
                          " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
                          templateWindowSize, searchWindowSize,
                          ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
-                         depth == CV_8U ? ocl::typeToStr(CV_32S) : "long",
+                         ocl::typeToStr(CV_32SC(hn)),
+                         depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
+                         format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
+                         depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
+                         format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
                          depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
-                         (sprintf(buf[0], "long%d", cn), buf[0]),
+                         format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
                          depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
-                         (sprintf(buf[1], "convert_long%d", cn), buf[1]),
+                         format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
                          BLOCK_COLS, BLOCK_ROWS,
                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
                          ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
@@ -115,13 +127,13 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h,
     if ((depth == CV_8U &&
          !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
                                                        searchWindowSize, templateWindowSize,
-                                                       h, cn,
+                                                       h, hn, cn,
                                                        almostTemplateWindowSizeSqBinShift,
                                                        abs)) ||
         (depth == CV_16U &&
          !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
                                                           searchWindowSize, templateWindowSize,
-                                                          h, cn,
+                                                          h, hn, cn,
                                                           almostTemplateWindowSizeSqBinShift,
                                                           abs)))
         return false;
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl
index 11837a5fc..936aed6fa 100644
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -20,9 +20,9 @@
 
 #ifdef OP_CALC_WEIGHTS
 
-__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist,
+__kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int almostMaxDist,
                                     FT almostDist2ActualDistMultiplier, int fixedPointMult,
-                                    FT den, FT WEIGHT_THRESHOLD)
+                                    w_t den, FT WEIGHT_THRESHOLD)
 {
     int almostDist = get_global_id(0);
 
@@ -30,14 +30,13 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost
     {
         FT dist = almostDist * almostDist2ActualDistMultiplier;
 #ifdef ABS
-        int weight = convert_int_sat_rte(fixedPointMult * exp(-dist*dist * den));
+        w_t w = exp((w_t)(-dist*dist) * den);
 #else
-        int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den));
+        w_t w = exp((w_t)(-dist) * den);
 #endif
-        if (weight < WEIGHT_THRESHOLD * fixedPointMult)
-            weight = 0;
-
-        almostDist2Weight[almostDist] = weight;
+        wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));
+        almostDist2Weight[almostDist] =
+            weight < WEIGHT_THRESHOLD * fixedPointMult ? (wlut_t)0 : weight;
     }
 }
 
@@ -208,14 +207,14 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset
 }
 
 inline void convolveWindow(__global const uchar * src, int src_step, int src_offset,
-                           __local int * dists, __global const int * almostDist2Weight,
+                           __local int * dists, __global const wlut_t * almostDist2Weight,
                            __global uchar * dst, int dst_step, int dst_offset,
                            int y, int x, int id, __local weight_t * weights_local,
                            __local sum_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift)
 {
     int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2;
-    weight_t weights = 0;
-    sum_t weighted_sum = (sum_t)(0);
+    weight_t weights = (weight_t)0;
+    sum_t weighted_sum = (sum_t)0;
 
     for (int i = id; i < SEARCH_SIZE_SQ; i += CTA_SIZE)
     {
@@ -223,10 +222,10 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
         sum_t src_value = convert_sum_t(*(__global const pixel_t *)(src + src_index));
 
         int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift;
-        int weight = almostDist2Weight[almostAvgDist];
+        weight_t weight = convert_weight_t(almostDist2Weight[almostAvgDist]);
 
-        weights += (weight_t)weight;
-        weighted_sum += (sum_t)(weight) * src_value;
+        weights += weight;
+        weighted_sum += (sum_t)weight * src_value;
     }
 
     weights_local[id] = weights;
@@ -251,13 +250,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off
             weighted_sum_local[2] + weighted_sum_local[3];
         weight_t weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3];
 
-        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)(weights_local_0));
+        *(__global pixel_t *)(dst + dst_index) = convert_pixel_t(weighted_sum_local_0 / (sum_t)weights_local_0);
     }
 }
 
 __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset,
                                    __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
-                                   __global const int * almostDist2Weight, __global uchar * buffer,
+                                   __global const wlut_t * almostDist2Weight, __global uchar * buffer,
                                    int almostTemplateWindowSizeSqBinShift)
 {
     int block_x = get_group_id(0), nblocks_x = get_num_groups(0);

From cc8d94c6fc977d116beb81c6a50f123790d01bef Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 6 Mar 2015 20:43:55 +0100
Subject: [PATCH 28/40] Addition of per-channel h-values for
 fastNlMeansDenoising[Multi][Abs]

---
 modules/photo/include/opencv2/photo.hpp       | 126 +++++++++-
 modules/photo/src/denoising.cpp               | 221 ++++++++++++++++++
 .../src/fast_nlmeans_denoising_invoker.hpp    |   2 +-
 ...fast_nlmeans_denoising_invoker_commons.hpp |   2 +-
 .../fast_nlmeans_multi_denoising_invoker.hpp  |   2 +-
 modules/photo/src/opencl/nlmeans.cl           |   2 +-
 6 files changed, 344 insertions(+), 11 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index 446e81750..5e11333ee 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -138,6 +138,31 @@ parameter.
 CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h = 3,
         int templateWindowSize = 7, int searchWindowSize = 21);
 
+/** @brief Perform image denoising using Non-local Means Denoising algorithm
+<http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
+optimizations. Noise is expected to be Gaussian white noise.
+
+@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: the greater the searchWindowSize, the
+longer the denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, one per
+channel. Big h value perfectly removes noise but also removes image
+details, smaller h value preserves details but also preserves some
+noise
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. Advanced usage of this function can be manual denoising of colored
+images in different colorspaces. Such an approach is used in fastNlMeansDenoisingColored by
+converting the image to CIELAB colorspace and then separately denoising the L and AB components
+with different h parameters.
+ */
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float *h,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
 /** @brief Perform image denoising using Non-local Means Denoising
 algorithm <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/>
 with several computational optimizations. Noise expected to be a
@@ -163,6 +188,33 @@ parameter.
 CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst, float h = 3,
         int templateWindowSize = 7, int searchWindowSize = 21);
 
+/** @brief Perform image denoising using Non-local Means Denoising
+algorithm <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/>
+with several computational optimizations. Noise is expected to be
+Gaussian white noise. Uses the squared sum of absolute value distances
+instead of the sum of squared distances for weight calculation.
+
+@param src Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel or 4-channel image.
+@param dst Output image with the same size and type as src .
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affects performance linearly: the greater the searchWindowSize, the
+longer the denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, one per
+channel. Big h value perfectly removes noise but also removes image
+details, smaller h value preserves details but also preserves some
+noise
+
+This function is expected to be applied to grayscale images. For colored images look at
+fastNlMeansDenoisingColored. Advanced usage of this function can be manual denoising of colored
+images in different colorspaces. Such an approach is used in fastNlMeansDenoisingColored by
+converting the image to CIELAB colorspace and then separately denoising the L and AB components
+with different h parameters.
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst, float *h,
+        int templateWindowSize = 7, int searchWindowSize = 21);
+
 /** @brief Modification of fastNlMeansDenoising function for colored images
 
 @param src Input 8-bit 3-channel image.
@@ -204,14 +256,73 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
-removes noise but also removes image details, smaller h value preserves details but also preserves
-some noise
+@param h Parameter regulating filter strength. Bigger h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
         int imgToDenoiseIndex, int temporalWindowSize,
         float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
 
+/** @brief Modification of fastNlMeansDenoising function for images sequence where consecutive images have been
+captured in small period of time. For example video. This version of the function is for grayscale
+images or for manual manipulation with colorspaces. For more details see
+<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
+
+@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
+4-channel images sequence. All images should have the same type and
+size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
+denoising time. Recommended value 21 pixels
+@param h Array of parameters regulating filter strength, one for each
+channel. Bigger h value perfectly removes noise but also removes image
+details, smaller h value preserves details but also preserves some
+noise
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float *h , int templateWindowSize = 7, int searchWindowSize = 21);
+
+/** @brief Modification of fastNlMeansDenoising function for images
+sequence where consequtive images have been captured in small period
+of time. For example video. This version of the function is for
+grayscale images or for manual manipulation with colorspaces. For more
+details see
+<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>. Uses
+squared sum of absolute value distances instead of sum of squared
+distances for weight calculation
+
+@param srcImgs Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel
+or 4-channel images sequence. All images should have the same type and
+size.
+@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
+@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
+be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
+imgToDenoiseIndex + temporalWindowSize / 2 from srcImgs will be used to denoise
+srcImgs[imgToDenoiseIndex] image.
+@param dst Output image with the same size and type as srcImgs images.
+@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
+Should be odd. Recommended value 7 pixels
+@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
+given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
+denoising time. Recommended value 21 pixels
+@param h Parameter regulating filter strength. Bigger h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
+ */
+CV_EXPORTS_W void fastNlMeansDenoisingMultiAbs( InputArrayOfArrays srcImgs, OutputArray dst,
+        int imgToDenoiseIndex, int temporalWindowSize,
+        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+
 /** @brief Modification of fastNlMeansDenoising function for images
 sequence where consequtive images have been captured in small period
 of time. For example video. This version of the function is for
@@ -235,13 +346,14 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Parameter regulating filter strength for luminance component. Bigger h value perfectly
-removes noise but also removes image details, smaller h value preserves details but also preserves
-some noise
+@param h Array of parameters regulating filter strength, one for each
+channel. Bigger h value perfectly removes noise but also removes image
+details, smaller h value preserves details but also preserves some
+noise
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMultiAbs( InputArrayOfArrays srcImgs, OutputArray dst,
         int imgToDenoiseIndex, int temporalWindowSize,
-        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
+        float *h, int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
 
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 30f638d4c..9f63254b0 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -90,6 +90,51 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     }
 }
 
+void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float *h,
+                               int templateWindowSize, int searchWindowSize)
+{
+    Size src_size = _src.size();
+    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
+               src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
+               ocl_fastNlMeansDenoising(_src, _dst, h, CV_MAT_CN(_src.type()),
+                                        templateWindowSize, searchWindowSize, false))
+
+    Mat src = _src.getMat();
+    _dst.create(src_size, src.type());
+    Mat dst = _dst.getMat();
+
+#ifdef HAVE_TEGRA_OPTIMIZATION
+    if(tegra::fastNlMeansDenoising(src, dst, h, templateWindowSize, searchWindowSize))
+        return;
+#endif
+
+    switch (src.type()) {
+        case CV_8U:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC2:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC3:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
                                   int  templateWindowSize, int searchWindowSize)
 {
@@ -150,6 +195,66 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
     }
 }
 
+void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float *h,
+                                  int  templateWindowSize, int searchWindowSize)
+{
+    Size src_size = _src.size();
+    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
+               src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
+               ocl_fastNlMeansDenoising(_src, _dst, h, CV_MAT_CN(_src.type()),
+                                        templateWindowSize, searchWindowSize, true))
+
+    Mat src = _src.getMat();
+    _dst.create(src_size, src.type());
+    Mat dst = _dst.getMat();
+
+    switch (src.type()) {
+        case CV_8U:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC2:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC3:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC4:
+            parallel_for_(cv::Range(0, src.rows),
+                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
                                       float h, float hForColorComponents,
                                       int templateWindowSize, int searchWindowSize)
@@ -269,6 +374,52 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
     }
 }
 
+void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
+                                    int imgToDenoiseIndex, int temporalWindowSize,
+                                    float *h, int templateWindowSize, int searchWindowSize)
+{
+    std::vector<Mat> srcImgs;
+    _srcImgs.getMatVector(srcImgs);
+
+    fastNlMeansDenoisingMultiCheckPreconditions(
+        srcImgs, imgToDenoiseIndex,
+        temporalWindowSize, templateWindowSize, searchWindowSize);
+
+    _dst.create(srcImgs[0].size(), srcImgs[0].type());
+    Mat dst = _dst.getMat();
+
+    switch (srcImgs[0].type())
+    {
+        case CV_8U:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC2:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC3:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                        int imgToDenoiseIndex, int temporalWindowSize,
                                        float h, int templateWindowSize, int searchWindowSize)
@@ -339,6 +490,76 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
     }
 }
 
+void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray _dst,
+                                       int imgToDenoiseIndex, int temporalWindowSize,
+                                       float *h, int templateWindowSize, int searchWindowSize)
+{
+    std::vector<Mat> srcImgs;
+    _srcImgs.getMatVector(srcImgs);
+
+    fastNlMeansDenoisingMultiCheckPreconditions(
+        srcImgs, imgToDenoiseIndex,
+        temporalWindowSize, templateWindowSize, searchWindowSize);
+
+    _dst.create(srcImgs[0].size(), srcImgs[0].type());
+    Mat dst = _dst.getMat();
+
+    switch (srcImgs[0].type())
+    {
+        case CV_8U:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC2:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC3:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_8UC4:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC4:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                            int imgToDenoiseIndex, int temporalWindowSize,
                                            float h, float hForColorComponents,
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index ff35550df..6e74acf03 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -236,7 +236,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Range& ra
                 for (int x = 0; x < search_window_size_; x++)
                 {
                     int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_;
-                    int weight = almost_dist2weight_[almostAvgDist];
+                    WT weight = almost_dist2weight_[almostAvgDist];
                     T p = cur_row_ptr[border_size_ + search_window_x + x];
                     incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                 }
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index efd482f6b..9833ea7d3 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -387,7 +387,7 @@ template <typename ET, typename IT, typename EW> struct incWithWeight_<Vec<ET, 4
 };
 
 template <typename T, typename IT, typename WT>
-static inline void incWithWeight(IT* estimation, IT* weights_sum, IT weight, T p)
+static inline void incWithWeight(IT* estimation, IT* weights_sum, WT weight, T p)
 {
     return incWithWeight_<T, IT, WT>::f(estimation, weights_sum, weight, p);
 }
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index cd3833a56..3f13f400d 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -262,7 +262,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT, D, WT>::operator() (const Rang
                     {
                         int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift;
 
-                        int weight =  almost_dist2weight[almostAvgDist];
+                        WT weight =  almost_dist2weight[almostAvgDist];
                         T p = cur_row_ptr[border_size_ + search_window_x + x];
                         incWithWeight<T, IT, WT>(estimation, weights_sum, weight, p);
                     }
diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl
index 936aed6fa..879665f48 100644
--- a/modules/photo/src/opencl/nlmeans.cl
+++ b/modules/photo/src/opencl/nlmeans.cl
@@ -36,7 +36,7 @@ __kernel void calcAlmostDist2Weight(__global wlut_t * almostDist2Weight, int alm
 #endif
         wlut_t weight = convert_wlut_t(fixedPointMult * (isnan(w) ? (w_t)1.0 : w));
         almostDist2Weight[almostDist] =
-            weight < WEIGHT_THRESHOLD * fixedPointMult ? (wlut_t)0 : weight;
+            weight < (wlut_t)(WEIGHT_THRESHOLD * fixedPointMult) ? (wlut_t)0 : weight;
     }
 }
 

From 21160137d4a8eaae0be2c2545ab7e18cd3bfc7a3 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Fri, 6 Mar 2015 20:44:31 +0100
Subject: [PATCH 29/40] Addition of test cases

---
 modules/photo/test/ocl/test_denoising.cpp | 50 +++++++++++++++++++----
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index 4aba4b51e..3b6998f06 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -16,7 +16,7 @@ namespace ocl {
 PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
 {
     int cn, templateWindowSize, searchWindowSize;
-    float h;
+    float h[4];
     bool use_roi, use_image;
 
     TEST_DECLARE_INPUT_PARAMETER(src);
@@ -30,7 +30,10 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
 
         templateWindowSize = 7;
         searchWindowSize = 21;
-        h = 3.0f;
+
+        ASSERT_TRUE(cn > 0 && cn <= 4);
+        for (int i=0; i<cn; i++)
+            h[i] = 3.0f + 0.5f*i;
     }
 
     virtual void generateTestData()
@@ -48,7 +51,6 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
         Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
         if (use_image) {
-            ASSERT_TRUE(cn == 1 || cn == 2 || cn == 3 || cn == 4);
             if (cn == 2) {
                 int from_to[] = { 0,0, 1,1 };
                 src_roi.create(roiSize, type);
@@ -78,6 +80,21 @@ OCL_TEST_P(FastNlMeansDenoising, Mat)
     {
         generateTestData();
 
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h[0], templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h[0], templateWindowSize, searchWindowSize));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1);
+    }
+}
+
+typedef FastNlMeansDenoisingTestBase FastNlMeansDenoising_hsep;
+
+OCL_TEST_P(FastNlMeansDenoising_hsep, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
         OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
         OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
 
@@ -93,6 +110,21 @@ OCL_TEST_P(FastNlMeansDenoisingAbs, Mat)
     {
         generateTestData();
 
+        OCL_OFF(cv::fastNlMeansDenoisingAbs(src_roi, dst_roi, h[0], templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoisingAbs(usrc_roi, udst_roi, h[0], templateWindowSize, searchWindowSize));
+
+        OCL_EXPECT_MATS_NEAR(dst, 1);
+    }
+}
+
+typedef FastNlMeansDenoisingTestBase FastNlMeansDenoisingAbs_hsep;
+
+OCL_TEST_P(FastNlMeansDenoisingAbs_hsep, Mat)
+{
+    for (int j = 0; j < test_loop_times; j++)
+    {
+        generateTestData();
+
         OCL_OFF(cv::fastNlMeansDenoisingAbs(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
         OCL_ON(cv::fastNlMeansDenoisingAbs(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
 
@@ -108,17 +140,21 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
     {
         generateTestData();
 
-        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h[0], h[0], templateWindowSize, searchWindowSize));
+        OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h[0], h[0], templateWindowSize, searchWindowSize));
 
         OCL_EXPECT_MATS_NEAR(dst, 1);
     }
 }
 
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
-                            Combine(Values(1, 2, 3, 4), Bool(), Bool()));
+                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising_hsep,
+                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingAbs,
-                            Combine(Values(1, 2, 3, 4), Bool(), Bool()));
+                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
+OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingAbs_hsep,
+                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
                             Combine(Values(3, 4), Bool(), Values(false)));
 

From c44488629a41d71c14e81b0705f9dbebad541bfe Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 9 Mar 2015 15:52:16 +0100
Subject: [PATCH 30/40] Changed parameters of fastNlMeansDenoising[Multi][Abs]
 from float * to std::vector<float>

---
 modules/photo/include/opencv2/photo.hpp   |  40 +-
 modules/photo/src/denoising.cpp           | 482 +++++++++-------------
 modules/photo/test/ocl/test_denoising.cpp |   5 +-
 3 files changed, 225 insertions(+), 302 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index 5e11333ee..d613c2420 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -149,10 +149,10 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Array of parameters regulating filter strength, one per
-channel. Big h value perfectly removes noise but also removes image
-details, smaller h value preserves details but also preserves some
-noise
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in src. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
 
 This function expected to be applied to grayscale images. For colored images look at
 fastNlMeansDenoisingColored. Advanced usage of this functions can be manual denoising of colored
@@ -160,7 +160,7 @@ image in different colorspaces. Such approach is used in fastNlMeansDenoisingCol
 image to CIELAB colorspace and then separately denoise L and AB components with different h
 parameter.
  */
-CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float *h,
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, std::vector<float> h,
         int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Perform image denoising using Non-local Means Denoising
@@ -201,10 +201,10 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Array of parameters regulating filter strength, one per
-channel. Big h value perfectly removes noise but also removes image
-details, smaller h value preserves details but also preserves some
-noise
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in src. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
 
 This function expected to be applied to grayscale images. For colored images look at
 fastNlMeansDenoisingColored. Advanced usage of this functions can be manual denoising of colored
@@ -212,7 +212,7 @@ image in different colorspaces. Such approach is used in fastNlMeansDenoisingCol
 image to CIELAB colorspace and then separately denoise L and AB components with different h
 parameter.
  */
-CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst, float *h,
+CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst, std::vector<float> h,
         int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Modification of fastNlMeansDenoising function for colored images
@@ -283,14 +283,14 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Array of parameters regulating filter strength, one for each
-channel. Bigger h value perfectly removes noise but also removes image
-details, smaller h value preserves details but also preserves some
-noise
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in src. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
         int imgToDenoiseIndex, int temporalWindowSize,
-        float *h , int templateWindowSize = 7, int searchWindowSize = 21);
+        std::vector<float> h , int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Modification of fastNlMeansDenoising function for images
 sequence where consequtive images have been captured in small period
@@ -346,14 +346,14 @@ Should be odd. Recommended value 7 pixels
 @param searchWindowSize Size in pixels of the window that is used to compute weighted average for
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
-@param h Array of parameters regulating filter strength, one for each
-channel. Bigger h value perfectly removes noise but also removes image
-details, smaller h value preserves details but also preserves some
-noise
+@param h Array of parameters regulating filter strength, either one
+parameter applied to all channels or one per channel in src. Big h value
+perfectly removes noise but also removes image details, smaller h
+value preserves details but also preserves some noise
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMultiAbs( InputArrayOfArrays srcImgs, OutputArray dst,
         int imgToDenoiseIndex, int temporalWindowSize,
-        float *h, int templateWindowSize = 7, int searchWindowSize = 21);
+        std::vector<float> h, int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
 
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 9f63254b0..7251b6446 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -48,55 +48,20 @@
 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
                                int templateWindowSize, int searchWindowSize)
 {
-    Size src_size = _src.size();
-    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
-               src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
-                                        templateWindowSize, searchWindowSize, false))
-
-    Mat src = _src.getMat();
-    _dst.create(src_size, src.type());
-    Mat dst = _dst.getMat();
-
-#ifdef HAVE_TEGRA_OPTIMIZATION
-    if(tegra::fastNlMeansDenoising(src, dst, h, templateWindowSize, searchWindowSize))
-        return;
-#endif
-
-    switch (src.type()) {
-        case CV_8U:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC2:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC3:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC4:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        default:
-            CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
-    }
+    fastNlMeansDenoising(_src, _dst, std::vector<float>(1, h),
+                         templateWindowSize, searchWindowSize);
 }
 
-void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float *h,
+void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, std::vector<float> h,
                                int templateWindowSize, int searchWindowSize)
 {
+    int hn = h.size();
+    CV_Assert(hn == 1 || hn == CV_MAT_CN(_src.type()));
+
     Size src_size = _src.size();
     CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, CV_MAT_CN(_src.type()),
+               ocl_fastNlMeansDenoising(_src, _dst, &h[0], hn,
                                         templateWindowSize, searchWindowSize, false))
 
     Mat src = _src.getMat();
@@ -111,23 +76,38 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float *h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                          FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
+                              src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC2:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         default:
             CV_Error(Error::StsBadArg,
@@ -138,70 +118,20 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float *h,
 void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
                                   int  templateWindowSize, int searchWindowSize)
 {
-    Size src_size = _src.size();
-    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
-               src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, &h, 1,
-                                        templateWindowSize, searchWindowSize, true))
-
-    Mat src = _src.getMat();
-    _dst.create(src_size, src.type());
-    Mat dst = _dst.getMat();
-
-    switch (src.type()) {
-        case CV_8U:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC2:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC3:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC4:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16U:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16UC2:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16UC3:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16UC4:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        default:
-            CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
-    }
+    fastNlMeansDenoisingAbs(_src, _dst, std::vector<float>(1, h),
+                            templateWindowSize, searchWindowSize);
 }
 
-void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float *h,
+void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, std::vector<float> h,
                                   int  templateWindowSize, int searchWindowSize)
 {
+    int hn = h.size();
+    CV_Assert(hn == 1 || hn == CV_MAT_CN(_src.type()));
+
     Size src_size = _src.size();
     CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, h, CV_MAT_CN(_src.type()),
+               ocl_fastNlMeansDenoising(_src, _dst, &h[0], hn,
                                         templateWindowSize, searchWindowSize, true))
 
     Mat src = _src.getMat();
@@ -211,43 +141,73 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float *h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                          FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
+                              src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC2:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16U:
             parallel_for_(cv::Range(0, src.rows),
                     FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC2:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC3:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC4:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
-                    src, dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         default:
             CV_Error(Error::StsBadArg,
@@ -332,51 +292,14 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
                                     int imgToDenoiseIndex, int temporalWindowSize,
                                     float h, int templateWindowSize, int searchWindowSize)
 {
-    std::vector<Mat> srcImgs;
-    _srcImgs.getMatVector(srcImgs);
-
-    fastNlMeansDenoisingMultiCheckPreconditions(
-        srcImgs, imgToDenoiseIndex,
-        temporalWindowSize, templateWindowSize, searchWindowSize);
-
-    _dst.create(srcImgs[0].size(), srcImgs[0].type());
-    Mat dst = _dst.getMat();
-
-    switch (srcImgs[0].type())
-    {
-        case CV_8U:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC2:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC3:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC4:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        default:
-            CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
-    }
+    fastNlMeansDenoisingMulti(_srcImgs, _dst, imgToDenoiseIndex, temporalWindowSize,
+                              std::vector<float>(1, h), templateWindowSize, searchWindowSize);
 }
 
 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                     int imgToDenoiseIndex, int temporalWindowSize,
-                                    float *h, int templateWindowSize, int searchWindowSize)
+                                    std::vector<float> h,
+                                    int templateWindowSize, int searchWindowSize)
 {
     std::vector<Mat> srcImgs;
     _srcImgs.getMatVector(srcImgs);
@@ -385,6 +308,9 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
         srcImgs, imgToDenoiseIndex,
         temporalWindowSize, templateWindowSize, searchWindowSize);
 
+    int hn = h.size();
+    CV_Assert(hn == 1 || hn == CV_MAT_CN(srcImgs[0].type()));
+
     _dst.create(srcImgs[0].size(), srcImgs[0].type());
     Mat dst = _dst.getMat();
 
@@ -392,27 +318,45 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
     {
         case CV_8U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                          FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
+                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                              dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC2:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         default:
             CV_Error(Error::StsBadArg,
@@ -424,75 +368,14 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
                                        int imgToDenoiseIndex, int temporalWindowSize,
                                        float h, int templateWindowSize, int searchWindowSize)
 {
-    std::vector<Mat> srcImgs;
-    _srcImgs.getMatVector(srcImgs);
-
-    fastNlMeansDenoisingMultiCheckPreconditions(
-        srcImgs, imgToDenoiseIndex,
-        temporalWindowSize, templateWindowSize, searchWindowSize);
-
-    _dst.create(srcImgs[0].size(), srcImgs[0].type());
-    Mat dst = _dst.getMat();
-
-    switch (srcImgs[0].type())
-    {
-        case CV_8U:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC2:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC3:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_8UC4:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16U:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16UC2:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16UC3:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        case CV_16UC4:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h));
-            break;
-        default:
-            CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
-    }
+    fastNlMeansDenoisingMultiAbs(_srcImgs, _dst, imgToDenoiseIndex, temporalWindowSize, // Abs overload, not DistSquared
+                                 std::vector<float>(1, h), templateWindowSize, searchWindowSize);
 }
 
 void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                        int imgToDenoiseIndex, int temporalWindowSize,
-                                       float *h, int templateWindowSize, int searchWindowSize)
+                                       std::vector<float> h,
+                                       int templateWindowSize, int searchWindowSize)
 {
     std::vector<Mat> srcImgs;
     _srcImgs.getMatVector(srcImgs);
@@ -501,6 +384,9 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
         srcImgs, imgToDenoiseIndex,
         temporalWindowSize, templateWindowSize, searchWindowSize);
 
+    int hn = h.size();
+    CV_Assert(hn == 1 || hn == CV_MAT_CN(srcImgs[0].type()));
+
     _dst.create(srcImgs[0].size(), srcImgs[0].type());
     Mat dst = _dst.getMat();
 
@@ -508,51 +394,87 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
     {
         case CV_8U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                          FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
+                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                              dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC2:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16U:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+                    dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC2:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC3:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC4:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, h));
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         default:
             CV_Error(Error::StsBadArg,
diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index 3b6998f06..360c16296 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -16,7 +16,7 @@ namespace ocl {
 PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
 {
     int cn, templateWindowSize, searchWindowSize;
-    float h[4];
+    std::vector<float> h;
     bool use_roi, use_image;
 
     TEST_DECLARE_INPUT_PARAMETER(src);
@@ -31,7 +31,7 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
         templateWindowSize = 7;
         searchWindowSize = 21;
 
-        ASSERT_TRUE(cn > 0 && cn <= 4);
+        h.resize(cn);
         for (int i=0; i<cn; i++)
             h[i] = 3.0f + 0.5f*i;
     }
@@ -51,6 +51,7 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
         Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
         randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255);
         if (use_image) {
+            ASSERT_TRUE(cn > 0 && cn <= 4);
             if (cn == 2) {
                 int from_to[] = { 0,0, 1,1 };
                 src_roi.create(roiSize, type);

From a594a0677afb7106791cfaf2e2d129fa0690d426 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 9 Mar 2015 16:00:24 +0100
Subject: [PATCH 31/40] Cleanup

---
 modules/photo/src/denoising.cpp | 72 ++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 7251b6446..c42ac4567 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -82,31 +82,31 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, std::vector<fl
         case CV_8UC2:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
+                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistSquared, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
+                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistSquared, Vec2i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
+                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistSquared, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
+                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistSquared, Vec3i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
+                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistSquared, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
+                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistSquared, Vec4i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         default:
@@ -147,31 +147,31 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, std::vector
         case CV_8UC2:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
+                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistAbs, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
+                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistAbs, Vec2i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
+                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistAbs, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
+                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistAbs, Vec3i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
+                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistAbs, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
+                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistAbs, Vec4i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16U:
@@ -182,31 +182,31 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, std::vector
         case CV_16UC2:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
+                              FastNlMeansDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
+                              FastNlMeansDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC3:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
+                              FastNlMeansDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
+                              FastNlMeansDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC4:
             if (hn == 1)
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
+                              FastNlMeansDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, int>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
+                              FastNlMeansDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
                                   src, dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         default:
@@ -325,36 +325,36 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
         case CV_8UC2:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistSquared, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistSquared, Vec2i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistSquared, Vec2i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistSquared, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistSquared, Vec3i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistSquared, Vec3i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistSquared, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistSquared, Vec4i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistSquared, Vec4i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
@@ -401,36 +401,36 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
         case CV_8UC2:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistAbs, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned, DistAbs, Vec2i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistAbs, Vec2i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC3:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistAbs, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned, DistAbs, Vec3i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistAbs, Vec3i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_8UC4:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistAbs, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec4b, int, unsigned, DistAbs, Vec4i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistAbs, Vec4i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
@@ -443,36 +443,36 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
         case CV_16UC2:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC3:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;
         case CV_16UC4:
             if (hn == 1)
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, int>(
+                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, int>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             else
                 parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
+                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
                                   srcImgs, imgToDenoiseIndex, temporalWindowSize,
                                   dst, templateWindowSize, searchWindowSize, &h[0]));
             break;

From 4b5753daea25f0ba439d7f82f7d320d9ff743d8a Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 9 Mar 2015 16:11:18 +0100
Subject: [PATCH 32/40] Corrected documentation

---
 modules/photo/include/opencv2/photo.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index d613c2420..1867d3ef0 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -150,7 +150,7 @@ Should be odd. Recommended value 7 pixels
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
 @param h Array of parameters regulating filter strength, either one
-parameter applied to all channels or one per channel in src. Big h value
+parameter applied to all channels or one per channel in dst. Big h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
 
@@ -202,7 +202,7 @@ Should be odd. Recommended value 7 pixels
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
 @param h Array of parameters regulating filter strength, either one
-parameter applied to all channels or one per channel in src. Big h value
+parameter applied to all channels or one per channel in dst. Big h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
 
@@ -284,7 +284,7 @@ Should be odd. Recommended value 7 pixels
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
 @param h Array of parameters regulating filter strength, either one
-parameter applied to all channels or one per channel in src. Big h value
+parameter applied to all channels or one per channel in dst. Big h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
  */
@@ -347,7 +347,7 @@ Should be odd. Recommended value 7 pixels
 given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
 denoising time. Recommended value 21 pixels
 @param h Array of parameters regulating filter strength, either one
-parameter applied to all channels or one per channel in src. Big h value
+parameter applied to all channels or one per channel in dst. Big h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
  */

From b471f9ee2622641426e7e71978ab9e1ba181841b Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 9 Mar 2015 18:52:25 +0100
Subject: [PATCH 33/40] Fixed call to tegra::fastNlMeansDenoising

---
 modules/photo/src/denoising.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index c42ac4567..5445d26cd 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -69,7 +69,7 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, std::vector<fl
     Mat dst = _dst.getMat();
 
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if(tegra::fastNlMeansDenoising(src, dst, h, templateWindowSize, searchWindowSize))
+    if(hn == 1 && tegra::fastNlMeansDenoising(src, dst, h[0], templateWindowSize, searchWindowSize))
         return;
 #endif
 

From 8aa07839954f6d738e4692e86642acf6ca35cd1a Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 9 Mar 2015 23:47:49 +0100
Subject: [PATCH 34/40] Changed parameter type of
 fastNlMeansDenoising[Multi][Abs] from std::vector<float> to const
 std::vector<float>&

---
 modules/photo/include/opencv2/photo.hpp            | 14 ++++++++------
 modules/photo/src/denoising.cpp                    |  8 ++++----
 .../photo/src/fast_nlmeans_denoising_opencl.hpp    |  4 ++--
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index 1867d3ef0..ff98ba74f 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -160,8 +160,9 @@ image in different colorspaces. Such approach is used in fastNlMeansDenoisingCol
 image to CIELAB colorspace and then separately denoise L and AB components with different h
 parameter.
  */
-CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, std::vector<float> h,
-        int templateWindowSize = 7, int searchWindowSize = 21);
+    CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
+                                        const std::vector<float>& h,
+                                        int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Perform image denoising using Non-local Means Denoising
 algorithm <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/>
@@ -212,8 +213,9 @@ image in different colorspaces. Such approach is used in fastNlMeansDenoisingCol
 image to CIELAB colorspace and then separately denoise L and AB components with different h
 parameter.
  */
-CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst, std::vector<float> h,
-        int templateWindowSize = 7, int searchWindowSize = 21);
+CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst,
+                                           const std::vector<float>& h,
+                                           int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Modification of fastNlMeansDenoising function for colored images
 
@@ -290,7 +292,7 @@ value preserves details but also preserves some noise
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
         int imgToDenoiseIndex, int temporalWindowSize,
-        std::vector<float> h , int templateWindowSize = 7, int searchWindowSize = 21);
+        const std::vector<float>& h , int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Modification of fastNlMeansDenoising function for images
 sequence where consequtive images have been captured in small period
@@ -353,7 +355,7 @@ value preserves details but also preserves some noise
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMultiAbs( InputArrayOfArrays srcImgs, OutputArray dst,
         int imgToDenoiseIndex, int temporalWindowSize,
-        std::vector<float> h, int templateWindowSize = 7, int searchWindowSize = 21);
+        const std::vector<float>& h, int templateWindowSize = 7, int searchWindowSize = 21);
 
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
 
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 7dde96081..5243b4330 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -52,7 +52,7 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
                          templateWindowSize, searchWindowSize);
 }
 
-void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, std::vector<float> h,
+void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, const std::vector<float>& h,
                                int templateWindowSize, int searchWindowSize)
 {
     int hn = h.size();
@@ -123,7 +123,7 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
                             templateWindowSize, searchWindowSize);
 }
 
-void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, std::vector<float> h,
+void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, const std::vector<float>& h,
                                   int  templateWindowSize, int searchWindowSize)
 {
     int hn = h.size();
@@ -299,7 +299,7 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
 
 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                     int imgToDenoiseIndex, int temporalWindowSize,
-                                    std::vector<float> h,
+                                    const std::vector<float>& h,
                                     int templateWindowSize, int searchWindowSize)
 {
     std::vector<Mat> srcImgs;
@@ -375,7 +375,7 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
 
 void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                        int imgToDenoiseIndex, int temporalWindowSize,
-                                       std::vector<float> h,
+                                       const std::vector<float>& h,
                                        int templateWindowSize, int searchWindowSize)
 {
     std::vector<Mat> srcImgs;
diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index a06dc6192..b7fdc7cf9 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -29,7 +29,7 @@ static int divUp(int a, int b)
 }
 
 template <typename FT, typename ST, typename WT>
-static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT *h, int hn, int cn,
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, const FT *h, int hn, int cn,
                                       int & almostTemplateWindowSizeSqBinShift, bool abs)
 {
     const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
@@ -78,7 +78,7 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
     return k.run(1, globalsize, NULL, false);
 }
 
-static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float *h, int hn,
+static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
                                      int templateWindowSize, int searchWindowSize, bool abs)
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);

From 812edb5fdc8eaa72151994280fb673a0d48fb62b Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 10 Mar 2015 01:34:02 +0100
Subject: [PATCH 35/40] Fixed bug, maxDist() should return int, not double

---
 modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
index 9833ea7d3..8f31e8b02 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp
@@ -176,7 +176,7 @@ public:
     }
 
     template <typename T>
-    static inline double maxDist()
+    static inline int maxDist()
     {
         return (int)pixelInfo<T>::sampleMax() * pixelInfo<T>::channels;
     }
@@ -298,7 +298,7 @@ public:
     }
 
     template <typename T>
-    static inline double maxDist()
+    static inline int maxDist()
     {
         return (int)pixelInfo<T>::sampleMax() * (int)pixelInfo<T>::sampleMax() *
             pixelInfo<T>::channels;

From 82c54104d6901e03027240cd9c6866f6b2509d0a Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 10 Mar 2015 01:39:43 +0100
Subject: [PATCH 36/40] Fix warnings on Win x64

---
 modules/photo/src/denoising.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 5243b4330..fb3889339 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -55,7 +55,7 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, const std::vector<float>& h,
                                int templateWindowSize, int searchWindowSize)
 {
-    int hn = h.size();
+    int hn = (int)h.size();
     CV_Assert(hn == 1 || hn == CV_MAT_CN(_src.type()));
 
     Size src_size = _src.size();
@@ -126,7 +126,7 @@ void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
 void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, const std::vector<float>& h,
                                   int  templateWindowSize, int searchWindowSize)
 {
-    int hn = h.size();
+    int hn = (int)h.size();
     CV_Assert(hn == 1 || hn == CV_MAT_CN(_src.type()));
 
     Size src_size = _src.size();
@@ -309,7 +309,7 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
         srcImgs, imgToDenoiseIndex,
         temporalWindowSize, templateWindowSize, searchWindowSize);
 
-    int hn = h.size();
+    int hn = (int)h.size();
     CV_Assert(hn == 1 || hn == CV_MAT_CN(srcImgs[0].type()));
 
     _dst.create(srcImgs[0].size(), srcImgs[0].type());
@@ -385,7 +385,7 @@ void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray
         srcImgs, imgToDenoiseIndex,
         temporalWindowSize, templateWindowSize, searchWindowSize);
 
-    int hn = h.size();
+    int hn = (int)h.size();
     CV_Assert(hn == 1 || hn == CV_MAT_CN(srcImgs[0].type()));
 
     _dst.create(srcImgs[0].size(), srcImgs[0].type());

From 9fff7896c528b44df92fe63b4bde59f85f98e1be Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Mon, 23 Mar 2015 23:23:35 +0100
Subject: [PATCH 37/40] Removed fastNlMeansDenoisingColored[Multi]Abs

---
 modules/photo/include/opencv2/photo.hpp   | 118 +-----------
 modules/photo/src/denoising.cpp           | 218 ----------------------
 modules/photo/test/ocl/test_denoising.cpp |  34 ----
 3 files changed, 1 insertion(+), 369 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index ff98ba74f..85336c936 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -160,63 +160,10 @@ image in different colorspaces. Such approach is used in fastNlMeansDenoisingCol
 image to CIELAB colorspace and then separately denoise L and AB components with different h
 parameter.
  */
-    CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
+CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
                                         const std::vector<float>& h,
                                         int templateWindowSize = 7, int searchWindowSize = 21);
 
-/** @brief Perform image denoising using Non-local Means Denoising
-algorithm <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/>
-with several computational optimizations. Noise expected to be a
-gaussian white noise. Uses squared sum of absolute value distances
-instead of sum of squared distances for weight calculation
-
-@param src Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel or 4-channel image.
-@param dst Output image with the same size and type as src .
-@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
-Should be odd. Recommended value 7 pixels
-@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
-given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
-denoising time. Recommended value 21 pixels
-@param h Parameter regulating filter strength. Big h value perfectly removes noise but also
-removes image details, smaller h value preserves details but also preserves some noise
-
-This function expected to be applied to grayscale images. For colored images look at
-fastNlMeansDenoisingColored. Advanced usage of this functions can be manual denoising of colored
-image in different colorspaces. Such approach is used in fastNlMeansDenoisingColored by converting
-image to CIELAB colorspace and then separately denoise L and AB components with different h
-parameter.
- */
-CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst, float h = 3,
-        int templateWindowSize = 7, int searchWindowSize = 21);
-
-/** @brief Perform image denoising using Non-local Means Denoising
-algorithm <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/>
-with several computational optimizations. Noise expected to be a
-gaussian white noise. Uses squared sum of absolute value distances
-instead of sum of squared distances for weight calculation
-
-@param src Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel or 4-channel image.
-@param dst Output image with the same size and type as src .
-@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
-Should be odd. Recommended value 7 pixels
-@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
-given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
-denoising time. Recommended value 21 pixels
-@param h Array of parameters regulating filter strength, either one
-parameter applied to all channels or one per channel in dst. Big h value
-perfectly removes noise but also removes image details, smaller h
-value preserves details but also preserves some noise
-
-This function expected to be applied to grayscale images. For colored images look at
-fastNlMeansDenoisingColored. Advanced usage of this functions can be manual denoising of colored
-image in different colorspaces. Such approach is used in fastNlMeansDenoisingColored by converting
-image to CIELAB colorspace and then separately denoise L and AB components with different h
-parameter.
- */
-CV_EXPORTS_W void fastNlMeansDenoisingAbs( InputArray src, OutputArray dst,
-                                           const std::vector<float>& h,
-                                           int templateWindowSize = 7, int searchWindowSize = 21);
-
 /** @brief Modification of fastNlMeansDenoising function for colored images
 
 @param src Input 8-bit 3-channel image.
@@ -294,69 +241,6 @@ CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputA
         int imgToDenoiseIndex, int temporalWindowSize,
         const std::vector<float>& h , int templateWindowSize = 7, int searchWindowSize = 21);
 
-/** @brief Modification of fastNlMeansDenoising function for images
-sequence where consequtive images have been captured in small period
-of time. For example video. This version of the function is for
-grayscale images or for manual manipulation with colorspaces. For more
-details see
-<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>. Uses
-squared sum of absolute value distances instead of sum of squared
-distances for weight calculation
-
-@param srcImgs Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel
-or 4-channel images sequence. All images should have the same type and
-size.
-@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
-@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
-be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
-imgToDenoiseIndex - temporalWindowSize / 2 from srcImgs will be used to denoise
-srcImgs[imgToDenoiseIndex] image.
-@param dst Output image with the same size and type as srcImgs images.
-@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
-Should be odd. Recommended value 7 pixels
-@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
-given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
-denoising time. Recommended value 21 pixels
-@param h Parameter regulating filter strength. Bigger h value
-perfectly removes noise but also removes image details, smaller h
-value preserves details but also preserves some noise
- */
-CV_EXPORTS_W void fastNlMeansDenoisingMultiAbs( InputArrayOfArrays srcImgs, OutputArray dst,
-        int imgToDenoiseIndex, int temporalWindowSize,
-        float h = 3, int templateWindowSize = 7, int searchWindowSize = 21);
-
-/** @brief Modification of fastNlMeansDenoising function for images
-sequence where consequtive images have been captured in small period
-of time. For example video. This version of the function is for
-grayscale images or for manual manipulation with colorspaces. For more
-details see
-<http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>. Uses
-squared sum of absolute value distances instead of sum of squared
-distances for weight calculation
-
-@param srcImgs Input 8-bit or 16-bit 1-channel, 2-channel, 3-channel
-or 4-channel images sequence. All images should have the same type and
-size.
-@param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
-@param temporalWindowSize Number of surrounding images to use for target image denoising. Should
-be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
-imgToDenoiseIndex - temporalWindowSize / 2 from srcImgs will be used to denoise
-srcImgs[imgToDenoiseIndex] image.
-@param dst Output image with the same size and type as srcImgs images.
-@param templateWindowSize Size in pixels of the template patch that is used to compute weights.
-Should be odd. Recommended value 7 pixels
-@param searchWindowSize Size in pixels of the window that is used to compute weighted average for
-given pixel. Should be odd. Affect performance linearly: greater searchWindowsSize - greater
-denoising time. Recommended value 21 pixels
-@param h Array of parameters regulating filter strength, either one
-parameter applied to all channels or one per channel in dst. Big h value
-perfectly removes noise but also removes image details, smaller h
-value preserves details but also preserves some noise
- */
-CV_EXPORTS_W void fastNlMeansDenoisingMultiAbs( InputArrayOfArrays srcImgs, OutputArray dst,
-        int imgToDenoiseIndex, int temporalWindowSize,
-        const std::vector<float>& h, int templateWindowSize = 7, int searchWindowSize = 21);
-
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
 
 @param srcImgs Input 8-bit 3-channel images sequence. All images should have the same type and
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index fb3889339..cd8a751f6 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -116,106 +116,6 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, const std::vec
     }
 }
 
-void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, float h,
-                                  int  templateWindowSize, int searchWindowSize)
-{
-    fastNlMeansDenoisingAbs(_src, _dst, std::vector<float>(1, h),
-                            templateWindowSize, searchWindowSize);
-}
-
-void cv::fastNlMeansDenoisingAbs( InputArray _src, OutputArray _dst, const std::vector<float>& h,
-                                  int  templateWindowSize, int searchWindowSize)
-{
-    int hn = (int)h.size();
-    CV_Assert(hn == 1 || hn == CV_MAT_CN(_src.type()));
-
-    Size src_size = _src.size();
-    CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
-               src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
-               ocl_fastNlMeansDenoising(_src, _dst, &h[0], hn,
-                                        templateWindowSize, searchWindowSize, true))
-
-    Mat src = _src.getMat();
-    _dst.create(src_size, src.type());
-    Mat dst = _dst.getMat();
-
-    switch (src.type()) {
-        case CV_8U:
-            parallel_for_(cv::Range(0, src.rows),
-                          FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
-                              src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC2:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistAbs, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistAbs, Vec2i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC3:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistAbs, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistAbs, Vec3i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC4:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistAbs, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistAbs, Vec4i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16U:
-            parallel_for_(cv::Range(0, src.rows),
-                    FastNlMeansDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
-                    src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16UC2:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16UC3:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16UC4:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        default:
-            CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
-    }
-}
-
 void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
                                       float h, float hForColorComponents,
                                       int templateWindowSize, int searchWindowSize)
@@ -365,124 +265,6 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
     }
 }
 
-void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray _dst,
-                                       int imgToDenoiseIndex, int temporalWindowSize,
-                                       float h, int templateWindowSize, int searchWindowSize)
-{
-    fastNlMeansDenoisingMulti(_srcImgs, _dst, imgToDenoiseIndex, temporalWindowSize,
-                              std::vector<float>(1, h), templateWindowSize, searchWindowSize);
-}
-
-void cv::fastNlMeansDenoisingMultiAbs( InputArrayOfArrays _srcImgs, OutputArray _dst,
-                                       int imgToDenoiseIndex, int temporalWindowSize,
-                                       const std::vector<float>& h,
-                                       int templateWindowSize, int searchWindowSize)
-{
-    std::vector<Mat> srcImgs;
-    _srcImgs.getMatVector(srcImgs);
-
-    fastNlMeansDenoisingMultiCheckPreconditions(
-        srcImgs, imgToDenoiseIndex,
-        temporalWindowSize, templateWindowSize, searchWindowSize);
-
-    int hn = (int)h.size();
-    CV_Assert(hn == 1 || hn == CV_MAT_CN(srcImgs[0].type()));
-
-    _dst.create(srcImgs[0].size(), srcImgs[0].type());
-    Mat dst = _dst.getMat();
-
-    switch (srcImgs[0].type())
-    {
-        case CV_8U:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                          FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistAbs, int>(
-                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                              dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC2:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistAbs, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistAbs, Vec2i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC3:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistAbs, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistAbs, Vec3i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC4:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistAbs, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistAbs, Vec4i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16U:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64, DistAbs, int>(
-                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                    dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16UC2:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 2>, int64, uint64, DistAbs, Vec2i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16UC3:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 3>, int64, uint64, DistAbs, Vec3i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_16UC4:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec<ushort, 4>, int64, uint64, DistAbs, Vec4i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        default:
-            CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3, CV_8UC4, CV_16U, CV_16UC2, CV_16UC3 and CV_16UC4 are supported");
-    }
-}
-
 void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                            int imgToDenoiseIndex, int temporalWindowSize,
                                            float h, float hForColorComponents,
diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index 360c16296..55b5a9e59 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -103,36 +103,6 @@ OCL_TEST_P(FastNlMeansDenoising_hsep, Mat)
     }
 }
 
-typedef FastNlMeansDenoisingTestBase FastNlMeansDenoisingAbs;
-
-OCL_TEST_P(FastNlMeansDenoisingAbs, Mat)
-{
-    for (int j = 0; j < test_loop_times; j++)
-    {
-        generateTestData();
-
-        OCL_OFF(cv::fastNlMeansDenoisingAbs(src_roi, dst_roi, h[0], templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoisingAbs(usrc_roi, udst_roi, h[0], templateWindowSize, searchWindowSize));
-
-        OCL_EXPECT_MATS_NEAR(dst, 1);
-    }
-}
-
-typedef FastNlMeansDenoisingTestBase FastNlMeansDenoisingAbs_hsep;
-
-OCL_TEST_P(FastNlMeansDenoisingAbs_hsep, Mat)
-{
-    for (int j = 0; j < test_loop_times; j++)
-    {
-        generateTestData();
-
-        OCL_OFF(cv::fastNlMeansDenoisingAbs(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoisingAbs(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
-
-        OCL_EXPECT_MATS_NEAR(dst, 1);
-    }
-}
-
 typedef FastNlMeansDenoisingTestBase FastNlMeansDenoisingColored;
 
 OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
@@ -152,10 +122,6 @@ OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
                             Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising_hsep,
                             Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingAbs,
-                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
-OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingAbs_hsep,
-                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
                             Combine(Values(3, 4), Bool(), Values(false)));
 

From 5f8d688664b3a1466f587a0be324463149347909 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 24 Mar 2015 00:47:11 +0100
Subject: [PATCH 38/40] Added parameter normType to fastNlMeansDenoising

---
 modules/photo/include/opencv2/photo.hpp       |   4 +-
 modules/photo/src/denoising.cpp               | 127 ++++++++++++------
 .../src/fast_nlmeans_denoising_opencl.hpp     |  29 ++--
 3 files changed, 103 insertions(+), 57 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index 85336c936..0a42424cb 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -153,6 +153,7 @@ denoising time. Recommended value 21 pixels
 parameter applied to all channels or one per channel in dst. Big h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
 
 This function expected to be applied to grayscale images. For colored images look at
 fastNlMeansDenoisingColored. Advanced usage of this functions can be manual denoising of colored
@@ -162,7 +163,8 @@ parameter.
  */
 CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst,
                                         const std::vector<float>& h,
-                                        int templateWindowSize = 7, int searchWindowSize = 21);
+                                        int templateWindowSize = 7, int searchWindowSize = 21,
+                                        int normType = NORM_L2);
 
 /** @brief Modification of fastNlMeansDenoising function for colored images
 
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index cd8a751f6..4e7922e40 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -45,6 +45,54 @@
 #include "fast_nlmeans_multi_denoising_invoker.hpp"
 #include "fast_nlmeans_denoising_opencl.hpp"
 
+template<typename ST, typename IT, typename UIT, typename D>
+static void fastNlMeansDenoising_( const Mat& src, Mat& dst, const std::vector<float>& h,
+                                   int templateWindowSize, int searchWindowSize)
+{
+    int hn = (int)h.size();
+
+    switch (CV_MAT_CN(src.type())) {
+        case 1:
+            parallel_for_(cv::Range(0, src.rows),
+                          FastNlMeansDenoisingInvoker<ST, IT, UIT, D, int>(
+                              src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 2:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 3:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 4:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, src.rows),
+                              FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
+                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                     "Unsupported number of channels! Only 1, 2, 3, and 4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
                                int templateWindowSize, int searchWindowSize)
 {
@@ -53,66 +101,59 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
 }
 
 void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, const std::vector<float>& h,
-                               int templateWindowSize, int searchWindowSize)
+                               int templateWindowSize, int searchWindowSize, int normType)
 {
-    int hn = (int)h.size();
-    CV_Assert(hn == 1 || hn == CV_MAT_CN(_src.type()));
+    int hn = (int)h.size(), type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert(hn == 1 || hn == cn);
 
     Size src_size = _src.size();
     CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()) &&
                src_size.width > 5 && src_size.height > 5, // low accuracy on small sizes
                ocl_fastNlMeansDenoising(_src, _dst, &h[0], hn,
-                                        templateWindowSize, searchWindowSize, false))
+                                        templateWindowSize, searchWindowSize, normType))
 
     Mat src = _src.getMat();
     _dst.create(src_size, src.type());
     Mat dst = _dst.getMat();
 
+    switch (normType) {
+        case NORM_L2:
 #ifdef HAVE_TEGRA_OPTIMIZATION
-    if(hn == 1 && tegra::useTegra() &&
-       tegra::fastNlMeansDenoising(src, dst, h[0], templateWindowSize, searchWindowSize))
-        return;
+            if(hn == 1 && tegra::useTegra() &&
+               tegra::fastNlMeansDenoising(src, dst, h[0], templateWindowSize, searchWindowSize))
+                return;
 #endif
-
-    switch (src.type()) {
-        case CV_8U:
-            parallel_for_(cv::Range(0, src.rows),
-                          FastNlMeansDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
-                              src, dst, templateWindowSize, searchWindowSize, &h[0]));
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoising_<uchar, int, unsigned, DistSquared>(src, dst, h,
+                                                                             templateWindowSize,
+                                                                             searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U is supported for NORM_L2");
+            }
             break;
-        case CV_8UC2:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistSquared, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec2b, int, unsigned, DistSquared, Vec2i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC3:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistSquared, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec3b, int, unsigned, DistSquared, Vec3i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC4:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistSquared, int>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, src.rows),
-                              FastNlMeansDenoisingInvoker<Vec4b, int, unsigned, DistSquared, Vec4i>(
-                                  src, dst, templateWindowSize, searchWindowSize, &h[0]));
+        case NORM_L1:
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoising_<uchar, int, unsigned, DistAbs>(src, dst, h,
+                                                                         templateWindowSize,
+                                                                         searchWindowSize);
+                    break;
+                case CV_16U:
+                    fastNlMeansDenoising_<ushort, int64, uint64, DistAbs>(src, dst, h,
+                                                                          templateWindowSize,
+                                                                          searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U and CV_16U are supported for NORM_L1");
+            }
             break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
+                     "Unsupported norm type! Only NORM_L2 and NORM_L1 are supported");
     }
 }
 
diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
index b7fdc7cf9..1c511f37b 100644
--- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp
@@ -29,8 +29,10 @@ static int divUp(int a, int b)
 }
 
 template <typename FT, typename ST, typename WT>
-static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, const FT *h, int hn, int cn,
-                                      int & almostTemplateWindowSizeSqBinShift, bool abs)
+static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
+                                      int searchWindowSize, int templateWindowSize,
+                                      const FT *h, int hn, int cn, int normType,
+                                      int & almostTemplateWindowSizeSqBinShift)
 {
     const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
         std::numeric_limits<ST>::max();
@@ -50,7 +52,7 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
     FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
 
     const FT WEIGHT_THRESHOLD = 1e-3f;
-    int maxDist = abs ? std::numeric_limits<ST>::max() * cn :
+    int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
         std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
     int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
     FT den[4];
@@ -66,7 +68,8 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
                          " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
                          ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
                          ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", abs ? " -D ABS" : ""));
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+                         normType == NORM_L1 ? " -D ABS" : ""));
     if (k.empty())
         return false;
 
@@ -79,13 +82,14 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow
 }
 
 static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
-                                     int templateWindowSize, int searchWindowSize, bool abs)
+                                     int templateWindowSize, int searchWindowSize, int normType)
 {
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
     Size size = _src.size();
 
-    if (cn != 1 && cn != 2 && cn != 3 && cn != 4 && depth != CV_8U && (!abs || depth != CV_16U))
+    if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
+                             (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
         return false;
 
     int templateWindowHalfWize = templateWindowSize / 2;
@@ -117,7 +121,8 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const fl
                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
                          ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
                          (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
-                         ocl::convertTypeStr(CV_32S, depth, cn, buf[3]), abs ? " -D ABS" : "");
+                         ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
+                         normType == NORM_L1 ? " -D ABS" : "");
 
     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
     if (k.empty())
@@ -127,15 +132,13 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const fl
     if ((depth == CV_8U &&
          !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
                                                        searchWindowSize, templateWindowSize,
-                                                       h, hn, cn,
-                                                       almostTemplateWindowSizeSqBinShift,
-                                                       abs)) ||
+                                                       h, hn, cn, normType,
+                                                       almostTemplateWindowSizeSqBinShift)) ||
         (depth == CV_16U &&
          !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
                                                           searchWindowSize, templateWindowSize,
-                                                          h, hn, cn,
-                                                          almostTemplateWindowSizeSqBinShift,
-                                                          abs)))
+                                                          h, hn, cn, normType,
+                                                          almostTemplateWindowSizeSqBinShift)))
         return false;
     CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
 

From 70a64ebe728584e3223e6984ecc04998b043d405 Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 24 Mar 2015 01:16:17 +0100
Subject: [PATCH 39/40] Added test cases

---
 modules/photo/test/ocl/test_denoising.cpp | 25 +++++++++++++----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp
index 55b5a9e59..f749564c6 100644
--- a/modules/photo/test/ocl/test_denoising.cpp
+++ b/modules/photo/test/ocl/test_denoising.cpp
@@ -13,9 +13,9 @@
 namespace cvtest {
 namespace ocl {
 
-PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
+PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, int, bool, bool)
 {
-    int cn, templateWindowSize, searchWindowSize;
+    int cn, normType, templateWindowSize, searchWindowSize;
     std::vector<float> h;
     bool use_roi, use_image;
 
@@ -25,8 +25,9 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool, bool)
     virtual void SetUp()
     {
         cn = GET_PARAM(0);
-        use_roi = GET_PARAM(1);
-        use_image = GET_PARAM(2);
+        normType = GET_PARAM(1);
+        use_roi = GET_PARAM(2);
+        use_image = GET_PARAM(3);
 
         templateWindowSize = 7;
         searchWindowSize = 21;
@@ -81,8 +82,8 @@ OCL_TEST_P(FastNlMeansDenoising, Mat)
     {
         generateTestData();
 
-        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h[0], templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h[0], templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, std::vector<float>(1, h[0]), templateWindowSize, searchWindowSize, normType));
 
         OCL_EXPECT_MATS_NEAR(dst, 1);
     }
@@ -96,8 +97,8 @@ OCL_TEST_P(FastNlMeansDenoising_hsep, Mat)
     {
         generateTestData();
 
-        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize));
-        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize));
+        OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize, normType));
+        OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize, normType));
 
         OCL_EXPECT_MATS_NEAR(dst, 1);
     }
@@ -119,11 +120,13 @@ OCL_TEST_P(FastNlMeansDenoisingColored, Mat)
 }
 
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising,
-                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1),
+                                    Bool(), Values(true)));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising_hsep,
-                            Combine(Values(1, 2, 3, 4), Bool(), Values(true)));
+                            Combine(Values(1, 2, 3, 4), Values((int)NORM_L2, (int)NORM_L1),
+                                    Bool(), Values(true)));
 OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoisingColored,
-                            Combine(Values(3, 4), Bool(), Values(false)));
+                            Combine(Values(3, 4), Values((int)NORM_L2), Bool(), Values(false)));
 
 } } // namespace cvtest::ocl
 

From 01d3df0d00e5c802108b90bd2dedb50e9a9ecacf Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Tue, 24 Mar 2015 02:01:31 +0100
Subject: [PATCH 40/40] Added normType parameter to fastNlMeansDenoisingMulti

---
 modules/photo/include/opencv2/photo.hpp |  18 +--
 modules/photo/src/denoising.cpp         | 141 ++++++++++++++++--------
 2 files changed, 107 insertions(+), 52 deletions(-)

diff --git a/modules/photo/include/opencv2/photo.hpp b/modules/photo/include/opencv2/photo.hpp
index 0a42424cb..c651b9ee3 100644
--- a/modules/photo/include/opencv2/photo.hpp
+++ b/modules/photo/include/opencv2/photo.hpp
@@ -142,7 +142,8 @@ CV_EXPORTS_W void fastNlMeansDenoising( InputArray src, OutputArray dst, float h
 <http://www.ipol.im/pub/algo/bcm_non_local_means_denoising/> with several computational
 optimizations. Noise expected to be a gaussian white noise
 
-@param src Input 8-bit 1-channel, 2-channel, 3-channel or 4-channel image.
+@param src Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel image.
 @param dst Output image with the same size and type as src .
 @param templateWindowSize Size in pixels of the template patch that is used to compute weights.
 Should be odd. Recommended value 7 pixels
@@ -153,7 +154,7 @@ denoising time. Recommended value 21 pixels
 parameter applied to all channels or one per channel in dst. Big h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
-@param normType Type of norm used for weight calcluation. Can be either NORM_L2 or NORM_L1
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
 
 This function expected to be applied to grayscale images. For colored images look at
 fastNlMeansDenoisingColored. Advanced usage of this functions can be manual denoising of colored
@@ -220,9 +221,9 @@ captured in small period of time. For example video. This version of the functio
 images or for manual manipulation with colorspaces. For more details see
 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.131.6394>
 
-@param srcImgs Input 8-bit 1-channel, 2-channel, 3-channel or
-4-channel images sequence. All images should have the same type and
-size.
+@param srcImgs Input 8-bit or 16-bit (only with NORM_L1) 1-channel,
+2-channel, 3-channel or 4-channel images sequence. All images should
+have the same type and size.
 @param imgToDenoiseIndex Target image to denoise index in srcImgs sequence
 @param temporalWindowSize Number of surrounding images to use for target image denoising. Should
 be odd. Images from imgToDenoiseIndex - temporalWindowSize / 2 to
@@ -238,10 +239,13 @@ denoising time. Recommended value 21 pixels
 parameter applied to all channels or one per channel in dst. Big h value
 perfectly removes noise but also removes image details, smaller h
 value preserves details but also preserves some noise
+@param normType Type of norm used for weight calculation. Can be either NORM_L2 or NORM_L1
  */
 CV_EXPORTS_W void fastNlMeansDenoisingMulti( InputArrayOfArrays srcImgs, OutputArray dst,
-        int imgToDenoiseIndex, int temporalWindowSize,
-        const std::vector<float>& h , int templateWindowSize = 7, int searchWindowSize = 21);
+                                             int imgToDenoiseIndex, int temporalWindowSize,
+                                             const std::vector<float>& h,
+                                             int templateWindowSize = 7, int searchWindowSize = 21,
+                                             int normType = NORM_L2);
 
 /** @brief Modification of fastNlMeansDenoisingMulti function for colored images sequences
 
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 4e7922e40..c68d09b92 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -230,6 +230,64 @@ static void fastNlMeansDenoisingMultiCheckPreconditions(
         }
 }
 
+// Multi-frame denoising for element type ST: selects the invoker from the channel count and h.size().
+template<typename ST, typename IT, typename UIT, typename D>
+static void fastNlMeansDenoisingMulti_( const std::vector<Mat>& srcImgs, Mat& dst,
+                                        int imgToDenoiseIndex, int temporalWindowSize,
+                                        const std::vector<float>& h,
+                                        int templateWindowSize, int searchWindowSize)
+{
+    int hn = (int)h.size(); // 1: one h shared by all channels (int variant), else per-channel h (VecNi)
+    switch (CV_MAT_CN(srcImgs[0].type())) // channels only: the caller already chose ST from the depth
+    {
+        case 1:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                          FastNlMeansMultiDenoisingInvoker<ST, IT, UIT, D, int>(
+                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                              dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 2:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 3:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        case 4:
+            if (hn == 1)
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            else
+                parallel_for_(cv::Range(0, srcImgs[0].rows),
+                              FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
+                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+            break;
+        default:
+            CV_Error(Error::StsBadArg,
+                     "Unsupported number of channels! Only 1, 2, 3, and 4 are supported");
+    }
+}
+
 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                     int imgToDenoiseIndex, int temporalWindowSize,
                                     float h, int templateWindowSize, int searchWindowSize)
@@ -241,7 +299,7 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
 void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst,
                                     int imgToDenoiseIndex, int temporalWindowSize,
                                     const std::vector<float>& h,
-                                    int templateWindowSize, int searchWindowSize)
+                                    int templateWindowSize, int searchWindowSize, int normType)
 {
     std::vector<Mat> srcImgs;
     _srcImgs.getMatVector(srcImgs);
@@ -251,58 +309,51 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
         temporalWindowSize, templateWindowSize, searchWindowSize);
 
     int hn = (int)h.size();
-    CV_Assert(hn == 1 || hn == CV_MAT_CN(srcImgs[0].type()));
+    int type = srcImgs[0].type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    CV_Assert(hn == 1 || hn == cn);
 
     _dst.create(srcImgs[0].size(), srcImgs[0].type());
     Mat dst = _dst.getMat();
 
-    switch (srcImgs[0].type())
-    {
-        case CV_8U:
-            parallel_for_(cv::Range(0, srcImgs[0].rows),
-                          FastNlMeansMultiDenoisingInvoker<uchar, int, unsigned, DistSquared, int>(
-                              srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                              dst, templateWindowSize, searchWindowSize, &h[0]));
+    switch (normType) {
+        case NORM_L2:
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoisingMulti_<uchar, int, unsigned,
+                                               DistSquared>(srcImgs, dst,
+                                                            imgToDenoiseIndex, temporalWindowSize,
+                                                            h,
+                                                            templateWindowSize, searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U is supported for NORM_L2");
+            }
             break;
-        case CV_8UC2:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistSquared, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec2b, int, unsigned, DistSquared, Vec2i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC3:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistSquared, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec3b, int, unsigned, DistSquared, Vec3i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            break;
-        case CV_8UC4:
-            if (hn == 1)
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistSquared, int>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
-            else
-                parallel_for_(cv::Range(0, srcImgs[0].rows),
-                              FastNlMeansMultiDenoisingInvoker<Vec4b, int, unsigned, DistSquared, Vec4i>(
-                                  srcImgs, imgToDenoiseIndex, temporalWindowSize,
-                                  dst, templateWindowSize, searchWindowSize, &h[0]));
+        case NORM_L1:
+            switch (depth) {
+                case CV_8U:
+                    fastNlMeansDenoisingMulti_<uchar, int, unsigned,
+                                               DistAbs>(srcImgs, dst,
+                                                        imgToDenoiseIndex, temporalWindowSize,
+                                                        h,
+                                                        templateWindowSize, searchWindowSize);
+                    break;
+                case CV_16U:
+                    fastNlMeansDenoisingMulti_<ushort, int64, uint64,
+                                               DistAbs>(srcImgs, dst,
+                                                        imgToDenoiseIndex, temporalWindowSize,
+                                                        h,
+                                                        templateWindowSize, searchWindowSize);
+                    break;
+                default:
+                    CV_Error(Error::StsBadArg,
+                             "Unsupported depth! Only CV_8U and CV_16U are supported for NORM_L1");
+            }
             break;
         default:
             CV_Error(Error::StsBadArg,
-                "Unsupported image format! Only CV_8U, CV_8UC2, CV_8UC3 and CV_8UC4 are supported");
+                     "Unsupported norm type! Only NORM_L2 and NORM_L1 are supported");
     }
 }