From 42db9e7153a6d10b429df0bc2108278251c11ebc Mon Sep 17 00:00:00 2001
From: Erik Karlsson <erik.r.karlsson@gmail.com>
Date: Thu, 12 Feb 2015 22:14:01 +0100
Subject: [PATCH] Basic 16-bit implmentation of fastNlMeansDenoising.
 Table-based exponetiation leads to high memory footprint and loss of
 precision in 16-bit mode.

---
 modules/photo/src/denoising.cpp               | 43 ++++++++++++++++---
 .../src/fast_nlmeans_denoising_invoker.hpp    | 14 +++---
 .../fast_nlmeans_multi_denoising_invoker.hpp  | 15 ++++---
 3 files changed, 55 insertions(+), 17 deletions(-)
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 724ea0eb0..0abeefe5b 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -65,17 +65,32 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
     switch (src.type()) {
         case CV_8U:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<uchar, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<uchar, int, unsigned>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<cv::Vec2b, int, unsigned>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, src.rows),
-                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned int>(
+                FastNlMeansDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<unsigned short, int64, uint64>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                    src, dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, src.rows),
+                FastNlMeansDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         default:
@@ -181,13 +196,31 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
             break;
         case CV_8UC2:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned int>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec2b, int, unsigned>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
             parallel_for_(cv::Range(0, srcImgs[0].rows),
-                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned int>(
+                FastNlMeansMultiDenoisingInvoker<cv::Vec3b, int, unsigned>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16U:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<ushort, int64, uint64>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC2:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 2>, int64, uint64>(
+                    srcImgs, imgToDenoiseIndex, temporalWindowSize,
+                    dst, templateWindowSize, searchWindowSize, h));
+            break;
+        case CV_16UC3:
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
+                FastNlMeansMultiDenoisingInvoker<cv::Vec<ushort, 3>, int64, uint64>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index 202e36013..27a016ae9 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -123,11 +123,13 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 16 bits to get a reasonable table size
+    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
+    // TODO: uses lots of memory and loses precision wtih 16-bit images ????
+    const size_t TABLE_MAX_BITS = 24;
     CV_Assert(template_window_size_ <= 46340); // sqrt(INT_MAX)
     int template_window_size_sq = template_window_size_ * template_window_size_;
-    almost_template_window_size_sq_bin_shift_ =
-        getNearestPowerOf2(template_window_size_sq) + 2*pixelInfo<T>::sampleBits() - 16;
+    almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq) +
+        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
     double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq;
 
     IT max_dist =
@@ -139,7 +141,7 @@ FastNlMeansDenoisingInvoker<T, IT, UIT>::FastNlMeansDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
 
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
@@ -232,7 +234,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
 
             // calc weights
             IT estimation[3], weights_sum = 0;
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
 
             for (int y = 0; y < search_window_size_; y++)
@@ -250,7 +252,7 @@ void FastNlMeansDenoisingInvoker<T, IT, UIT>::operator() (const Range& range) co
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum/2) / weights_sum;
 
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 48276b426..c90249b82 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -131,12 +131,15 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
 
     // precalc weight for every possible l2 dist between blocks
     // additional optimization of precalced weights to replace division(averaging) by binary shift
-    // squared distances are truncated to 16 bits to get a reasonable table size
+    // squared distances are truncated to 24 bits to avoid unreasonable table sizes
+    // TODO: uses lots of memory and loses precision wtih 16-bit images ????
+    const size_t TABLE_MAX_BITS = 24;
     int template_window_size_sq = template_window_size_ * template_window_size_;
     almost_template_window_size_sq_bin_shift = 0;
     while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq)
         almost_template_window_size_sq_bin_shift++;
-    almost_template_window_size_sq_bin_shift += 2*pixelInfo<T>::sampleBits() - 16;
+    almost_template_window_size_sq_bin_shift +=
+        std::max(2*pixelInfo<T>::sampleBits(), TABLE_MAX_BITS) - TABLE_MAX_BITS;
 
     int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift;
     double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq;
@@ -150,7 +153,7 @@ FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::FastNlMeansMultiDenoisingInvoker(
     for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++)
     {
         double dist = almost_dist * almost_dist2actual_dist_multiplier;
-        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T))));
+        IT weight = (IT)round(fixed_point_mult_ * std::exp(-dist / (h * h * pixelInfo<T>::channels)));
 
         if (weight < WEIGHT_THRESHOLD * fixed_point_mult_)
             weight = 0;
@@ -254,7 +257,7 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
             IT weights_sum = 0;
 
             IT estimation[3];
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
                 estimation[channel_num] = 0;
 
             for (int d = 0; d < temporal_window_size_; d++)
@@ -279,8 +282,8 @@ void FastNlMeansMultiDenoisingInvoker<T, IT, UIT>::operator() (const Range& rang
                 }
             }
 
-            for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++)
-                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum; // ????
+            for (size_t channel_num = 0; channel_num < pixelInfo<T>::channels; channel_num++)
+                estimation[channel_num] = (static_cast<UIT>(estimation[channel_num]) + weights_sum / 2) / weights_sum;
 
             dst_.at<T>(i,j) = saturateCastFromArray<T, IT>(estimation);