diff --git a/modules/photo/perf/opencl/perf_denoising.cpp b/modules/photo/perf/opencl/perf_denoising.cpp index 0bdf08363..a2ee9178a 100644 --- a/modules/photo/perf/opencl/perf_denoising.cpp +++ b/modules/photo/perf/opencl/perf_denoising.cpp @@ -26,7 +26,7 @@ OCL_PERF_TEST(Photo, DenoisingGrayscale) OCL_TEST_CYCLE() cv::fastNlMeansDenoising(original, result, 10); - SANITY_CHECK(result); + SANITY_CHECK(result, 1); } OCL_PERF_TEST(Photo, DenoisingColored) @@ -42,10 +42,10 @@ OCL_PERF_TEST(Photo, DenoisingColored) OCL_TEST_CYCLE() cv::fastNlMeansDenoisingColored(original, result, 10, 10); - SANITY_CHECK(result); + SANITY_CHECK(result, 2); } -OCL_PERF_TEST(Photo, DenoisingGrayscaleMulti) +OCL_PERF_TEST(Photo, DISABLED_DenoisingGrayscaleMulti) { const int imgs_count = 3; @@ -68,7 +68,7 @@ OCL_PERF_TEST(Photo, DenoisingGrayscaleMulti) SANITY_CHECK(result); } -OCL_PERF_TEST(Photo, DenoisingColoredMulti) +OCL_PERF_TEST(Photo, DISABLED_DenoisingColoredMulti) { const int imgs_count = 3; diff --git a/modules/photo/src/arrays.hpp b/modules/photo/src/arrays.hpp index a33018e59..4aec5f7a1 100644 --- a/modules/photo/src/arrays.hpp +++ b/modules/photo/src/arrays.hpp @@ -39,10 +39,14 @@ // //M*/ +#include "opencv2/core/base.hpp" + #ifndef __OPENCV_DENOISING_ARRAYS_HPP__ #define __OPENCV_DENOISING_ARRAYS_HPP__ -template struct Array2d { +template +struct Array2d +{ T* a; int n1,n2; bool needToDeallocArray; @@ -50,14 +54,16 @@ template struct Array2d { Array2d(const Array2d& array2d): a(array2d.a), n1(array2d.n1), n2(array2d.n2), needToDeallocArray(false) { - if (array2d.needToDeallocArray) { - // copy constructor for self allocating arrays not supported - throw new std::exception(); + if (array2d.needToDeallocArray) + { + CV_Error(Error::BadDataPtr, "Copy constructor for self allocating arrays not supported"); } } Array2d(T* _a, int _n1, int _n2): - a(_a), n1(_n1), n2(_n2), needToDeallocArray(false) {} + a(_a), n1(_n1), n2(_n2), needToDeallocArray(false) + { + } Array2d(int _n1, int _n2): n1(_n1), n2(_n2), needToDeallocArray(true) @@ -65,28 +71,34 @@ template struct Array2d { a = new T[n1*n2]; } - ~Array2d() { - if (needToDeallocArray) { + ~Array2d() + { + if (needToDeallocArray) delete[] a; - } } - T* operator [] (int i) { + T* operator [] (int i) + { return a + i*n2; } - inline T* row_ptr(int i) { + inline T* row_ptr(int i) + { return (*this)[i]; } }; -template struct Array3d { +template +struct Array3d +{ T* a; int n1,n2,n3; bool needToDeallocArray; Array3d(T* _a, int _n1, int _n2, int _n3): - a(_a), n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(false) {} + a(_a), n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(false) + { + } Array3d(int _n1, int _n2, int _n3): n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(true) @@ -94,64 +106,72 @@ template struct Array3d { a = new T[n1*n2*n3]; } - ~Array3d() { - if (needToDeallocArray) { + ~Array3d() + { + if (needToDeallocArray) delete[] a; - } } - Array2d operator [] (int i) { + Array2d operator [] (int i) + { Array2d array2d(a + i*n2*n3, n2, n3); return array2d; } - inline T* row_ptr(int i1, int i2) { + inline T* row_ptr(int i1, int i2) + { return a + i1*n2*n3 + i2*n3; } }; -template struct Array4d { +template +struct Array4d +{ T* a; int n1,n2,n3,n4; bool needToDeallocArray; int steps[4]; - void init_steps() { + void init_steps() + { steps[0] = n2*n3*n4; steps[1] = n3*n4; steps[2] = n4; steps[3] = 1; } - Array4d(T* _a, int _n1, int _n2, int _n3, int _n4): + Array4d(T* _a, int _n1, int _n2, int _n3, int _n4) : a(_a), n1(_n1), n2(_n2), n3(_n3), n4(_n4), needToDeallocArray(false) - { + { init_steps(); - } + } - Array4d(int _n1, int _n2, int _n3, int _n4): + Array4d(int _n1, int _n2, int _n3, int _n4) : n1(_n1), n2(_n2), n3(_n3), n4(_n4), needToDeallocArray(true) { a = new T[n1*n2*n3*n4]; init_steps(); - } - - ~Array4d() { - if (needToDeallocArray) { - delete[] a; - } } - Array3d operator [] (int i) { + ~Array4d() + { + if (needToDeallocArray) + delete[] a; + } + + Array3d operator [] (int i) + { Array3d array3d(a + i*n2*n3*n4, n2, n3, n4); return array3d; } - inline T* row_ptr(int i1, int i2, int i3) { + inline T* row_ptr(int i1, int i2, int i3) + { return a + i1*n2*n3*n4 + i2*n3*n4 + i3*n4; } - inline int step_size(int dimension) { + inline int step_size(int dimension) + { return steps[dimension]; } }; diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp index a5532041c..98bb4fc8b 100644 --- a/modules/photo/src/denoising.cpp +++ b/modules/photo/src/denoising.cpp @@ -40,14 +40,17 @@ //M*/ #include "precomp.hpp" -#include "opencv2/photo.hpp" -#include "opencv2/imgproc.hpp" + #include "fast_nlmeans_denoising_invoker.hpp" #include "fast_nlmeans_multi_denoising_invoker.hpp" +#include "fast_nlmeans_denoising_opencl.hpp" void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h, int templateWindowSize, int searchWindowSize) { + CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()), + ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize)) + Mat src = _src.getMat(); _dst.create(src.size(), src.type()); Mat dst = _dst.getMat(); @@ -83,15 +86,20 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, float h, float hForColorComponents, int templateWindowSize, int searchWindowSize) { - Mat src = _src.getMat(); - _dst.create(src.size(), src.type()); - Mat dst = _dst.getMat(); - - if (src.type() != CV_8UC3) { + if (_src.type() != CV_8UC3) + { CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3!"); return; } + CV_OCL_RUN(_src.dims() <= 2 && (_dst.isUMat() || _src.isUMat()), + ocl_fastNlMeansDenoisingColored(_src, _dst, h, hForColorComponents, + templateWindowSize, searchWindowSize)) + + Mat src = _src.getMat(); + _dst.create(src.size(), src.type()); + Mat dst = _dst.getMat(); + Mat src_lab; cvtColor(src, src_lab, COLOR_LBGR2Lab); @@ -117,7 +125,8 @@ static void fastNlMeansDenoisingMultiCheckPreconditions( int templateWindowSize, int searchWindowSize) { int src_imgs_size = static_cast(srcImgs.size()); - if (src_imgs_size == 0) { + if (src_imgs_size == 0) + { CV_Error(Error::StsBadArg, "Input images vector should not be empty!"); } @@ -136,11 +145,11 @@ static void fastNlMeansDenoisingMultiCheckPreconditions( "should be chosen corresponding srcImgs size!"); } - for (int i = 1; i < src_imgs_size; i++) { - if (srcImgs[0].size() != srcImgs[i].size() || srcImgs[0].type() != srcImgs[i].type()) { + for (int i = 1; i < src_imgs_size; i++) + if (srcImgs[0].size() != srcImgs[i].size() || srcImgs[0].type() != srcImgs[i].type()) + { CV_Error(Error::StsBadArg, "Input images should have the same size and type!"); } - } } void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst, @@ -152,12 +161,13 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds fastNlMeansDenoisingMultiCheckPreconditions( srcImgs, imgToDenoiseIndex, - temporalWindowSize, templateWindowSize, searchWindowSize - ); + temporalWindowSize, templateWindowSize, searchWindowSize); + _dst.create(srcImgs[0].size(), srcImgs[0].type()); Mat dst = _dst.getMat(); - switch (srcImgs[0].type()) { + switch (srcImgs[0].type()) + { case CV_8U: parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker( @@ -192,15 +202,15 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr fastNlMeansDenoisingMultiCheckPreconditions( srcImgs, imgToDenoiseIndex, - temporalWindowSize, templateWindowSize, searchWindowSize - ); + temporalWindowSize, templateWindowSize, searchWindowSize); _dst.create(srcImgs[0].size(), srcImgs[0].type()); Mat dst = _dst.getMat(); int src_imgs_size = static_cast(srcImgs.size()); - if (srcImgs[0].type() != CV_8UC3) { + if (srcImgs[0].type() != CV_8UC3) + { CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3!"); return; } @@ -211,7 +221,8 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr std::vector src_lab(src_imgs_size); std::vector l(src_imgs_size); std::vector ab(src_imgs_size); - for (int i = 0; i < src_imgs_size; i++) { + for (int i = 0; i < src_imgs_size; i++) + { src_lab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC3); l[i] = Mat::zeros(srcImgs[0].size(), CV_8UC1); ab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC2); diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp index 3e9cc008b..b8f5a0392 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp @@ -51,61 +51,61 @@ using namespace cv; template -struct FastNlMeansDenoisingInvoker : ParallelLoopBody { - public: - FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst, - int template_window_size, int search_window_size, const float h); +struct FastNlMeansDenoisingInvoker : + public ParallelLoopBody +{ +public: + FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst, + int template_window_size, int search_window_size, const float h); - void operator() (const Range& range) const; + void operator() (const Range& range) const; - private: - void operator= (const FastNlMeansDenoisingInvoker&); +private: + void operator= (const FastNlMeansDenoisingInvoker&); - const Mat& src_; - Mat& dst_; + const Mat& src_; + Mat& dst_; - Mat extended_src_; - int border_size_; + Mat extended_src_; + int border_size_; - int template_window_size_; - int search_window_size_; + int template_window_size_; + int search_window_size_; - int template_window_half_size_; - int search_window_half_size_; + int template_window_half_size_; + int search_window_half_size_; - int fixed_point_mult_; - int almost_template_window_size_sq_bin_shift_; - std::vector almost_dist2weight_; + int fixed_point_mult_; + int almost_template_window_size_sq_bin_shift_; + std::vector almost_dist2weight_; - void calcDistSumsForFirstElementInRow( - int i, - Array2d& dist_sums, - Array3d& col_dist_sums, - Array3d& up_col_dist_sums) const; + void calcDistSumsForFirstElementInRow( + int i, Array2d& dist_sums, + Array3d& col_dist_sums, + Array3d& up_col_dist_sums) const; - void calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, - Array2d& dist_sums, - Array3d& col_dist_sums, - Array3d& up_col_dist_sums) const; + void calcDistSumsForElementInFirstRow( + int i, int j, int first_col_num, + Array2d& dist_sums, + Array3d& col_dist_sums, + Array3d& up_col_dist_sums) const; }; inline int getNearestPowerOf2(int value) { int p = 0; - while( 1 << p < value) ++p; + while( 1 << p < value) + ++p; return p; } template FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( - const cv::Mat& src, - cv::Mat& dst, + const Mat& src, Mat& dst, int template_window_size, int search_window_size, - const float h) : src_(src), dst_(dst) + const float h) : + src_(src), dst_(dst) { CV_Assert(src.channels() == sizeof(T)); //T is Vec1b or Vec2b or Vec3b @@ -115,26 +115,25 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( search_window_size_ = search_window_half_size_ * 2 + 1; border_size_ = search_window_half_size_ + template_window_half_size_; - copyMakeBorder(src_, extended_src_, - border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT); + copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT); const int max_estimate_sum_value = search_window_size_ * search_window_size_ * 255; fixed_point_mult_ = std::numeric_limits::max() / max_estimate_sum_value; // precalc weight for every possible l2 dist between blocks // additional optimization of precalced weights to replace division(averaging) by binary shift - - CV_Assert(template_window_size_ <= 46340 ); // sqrt(INT_MAX) + CV_Assert(template_window_size_ <= 46340); // sqrt(INT_MAX) int template_window_size_sq = template_window_size_ * template_window_size_; almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq); double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq; int max_dist = 255 * 255 * sizeof(T); - int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1); + int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1); almost_dist2weight_.resize(almost_max_dist); const double WEIGHT_THRESHOLD = 0.001; - for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) { + for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) + { double dist = almost_dist * almost_dist2actual_dist_multiplier; int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); @@ -144,50 +143,56 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( almost_dist2weight_[almost_dist] = weight; } CV_Assert(almost_dist2weight_[0] == fixed_point_mult_); - // additional optimization init end - if (dst_.empty()) { + // additional optimization init end + if (dst_.empty()) dst_ = Mat::zeros(src_.size(), src_.type()); - } } template -void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { +void FastNlMeansDenoisingInvoker::operator() (const Range& range) const +{ int row_from = range.start; int row_to = range.end - 1; + // sums of cols anf rows for current pixel p Array2d dist_sums(search_window_size_, search_window_size_); - // for lazy calc optimization + // for lazy calc optimization (sum of cols for current pixel) Array3d col_dist_sums(template_window_size_, search_window_size_, search_window_size_); int first_col_num = -1; + // last elements of column sum (for each element in row) Array3d up_col_dist_sums(src_.cols, search_window_size_, search_window_size_); - for (int i = row_from; i <= row_to; i++) { - for (int j = 0; j < src_.cols; j++) { + for (int i = row_from; i <= row_to; i++) + { + for (int j = 0; j < src_.cols; j++) + { int search_window_y = i - search_window_half_size_; int search_window_x = j - search_window_half_size_; // calc dist_sums - if (j == 0) { + if (j == 0) + { calcDistSumsForFirstElementInRow(i, dist_sums, col_dist_sums, up_col_dist_sums); first_col_num = 0; - - } else { // calc cur dist_sums using previous dist_sums - if (i == row_from) { + } + else + { + // calc cur dist_sums using previous dist_sums + if (i == row_from) + { calcDistSumsForElementInFirstRow(i, j, first_col_num, dist_sums, col_dist_sums, up_col_dist_sums); - - } else { + } + else + { int ay = border_size_ + i; int ax = border_size_ + j + template_window_half_size_; - int start_by = - border_size_ + i - search_window_half_size_; - - int start_bx = - border_size_ + j - search_window_half_size_ + template_window_half_size_; + int start_by = border_size_ + i - search_window_half_size_; + int start_bx = border_size_ + j - search_window_half_size_ + template_window_half_size_; T a_up = extended_src_.at(ay - template_window_half_size_ - 1, ax); T a_down = extended_src_.at(ay + template_window_half_size_, ax); @@ -195,33 +200,25 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { // copy class member to local variable for optimization int search_window_size = search_window_size_; - for (int y = 0; y < search_window_size; y++) { - int* dist_sums_row = dist_sums.row_ptr(y); + for (int y = 0; y < search_window_size; y++) + { + int * dist_sums_row = dist_sums.row_ptr(y); + int * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y); + int * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y); - int* col_dist_sums_row = col_dist_sums.row_ptr(first_col_num,y); + const T * b_up_ptr = extended_src_.ptr(start_by - template_window_half_size_ - 1 + y); + const T * b_down_ptr = extended_src_.ptr(start_by + template_window_half_size_ + y); - int* up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y); - - const T* b_up_ptr = - extended_src_.ptr(start_by - template_window_half_size_ - 1 + y); - - const T* b_down_ptr = - extended_src_.ptr(start_by + template_window_half_size_ + y); - - for (int x = 0; x < search_window_size; x++) { + for (int x = 0; x < search_window_size; x++) + { + // remove from current pixel sum column sum with index "first_col_num" dist_sums_row[x] -= col_dist_sums_row[x]; - col_dist_sums_row[x] = - up_col_dist_sums_row[x] + - calcUpDownDist( - a_up, a_down, - b_up_ptr[start_bx + x], b_down_ptr[start_bx + x] - ); + int bx = start_bx + x; + col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]); dist_sums_row[x] += col_dist_sums_row[x]; - up_col_dist_sums_row[x] = col_dist_sums_row[x]; - } } } @@ -230,20 +227,17 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { } // calc weights - int weights_sum = 0; - - int estimation[3]; - for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) { + int estimation[3], weights_sum = 0; + for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) estimation[channel_num] = 0; - } - for (int y = 0; y < search_window_size_; y++) { + for (int y = 0; y < search_window_size_; y++) + { const T* cur_row_ptr = extended_src_.ptr(border_size_ + search_window_y + y); int* dist_sums_row = dist_sums.row_ptr(y); - for (int x = 0; x < search_window_size_; x++) { - int almostAvgDist = - dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_; - + for (int x = 0; x < search_window_size_; x++) + { + int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_; int weight = almost_dist2weight_[almostAvgDist]; weights_sum += weight; @@ -269,18 +263,19 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForFirstElementInRow( { int j = 0; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[y][x] = 0; - for (int tx = 0; tx < template_window_size_; tx++) { + for (int tx = 0; tx < template_window_size_; tx++) col_dist_sums[tx][y][x] = 0; - } int start_y = i + y - search_window_half_size_; int start_x = j + x - search_window_half_size_; - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { - for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) { + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) + for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) + { int dist = calcDist(extended_src_, border_size_ + i + ty, border_size_ + j + tx, border_size_ + start_y + ty, border_size_ + start_x + tx); @@ -288,18 +283,14 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForFirstElementInRow( dist_sums[y][x] += dist; col_dist_sums[tx + template_window_half_size_][y][x] += dist; } - } up_col_dist_sums[j][y][x] = col_dist_sums[template_window_size_ - 1][y][x]; } - } } template inline void FastNlMeansDenoisingInvoker::calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, + int i, int j, int first_col_num, Array2d& dist_sums, Array3d& col_dist_sums, Array3d& up_col_dist_sums) const @@ -312,23 +303,20 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForElementInFirstRow( int new_last_col_num = first_col_num; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[y][x] -= col_dist_sums[first_col_num][y][x]; col_dist_sums[new_last_col_num][y][x] = 0; int by = start_by + y; int bx = start_bx + x; - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { - col_dist_sums[new_last_col_num][y][x] += - calcDist(extended_src_, ay + ty, ax, by + ty, bx); - } + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) + col_dist_sums[new_last_col_num][y][x] += calcDist(extended_src_, ay + ty, ax, by + ty, bx); dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x]; - up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x]; } - } } #endif diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp index 978f3170c..ab7db5d2d 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp @@ -46,29 +46,35 @@ using namespace cv; template static inline int calcDist(const T a, const T b); -template <> inline int calcDist(const uchar a, const uchar b) { +template <> inline int calcDist(const uchar a, const uchar b) +{ return (a-b) * (a-b); } -template <> inline int calcDist(const Vec2b a, const Vec2b b) { +template <> inline int calcDist(const Vec2b a, const Vec2b b) +{ return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]); } -template <> inline int calcDist(const Vec3b a, const Vec3b b) { +template <> inline int calcDist(const Vec3b a, const Vec3b b) +{ return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]) + (a[2]-b[2])*(a[2]-b[2]); } -template static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2) { +template static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2) +{ const T a = m.at(i1, j1); const T b = m.at(i2, j2); return calcDist(a,b); } -template static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) { - return calcDist(a_down,b_down) - calcDist(a_up, b_up); +template static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) +{ + return calcDist(a_down, b_down) - calcDist(a_up, b_up); } -template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar b_up, uchar b_down) { +template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar b_up, uchar b_down) +{ int A = a_down - b_down; int B = a_up - b_up; return (A-B)*(A+B); @@ -76,16 +82,37 @@ template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar b_up, uch template static inline void incWithWeight(int* estimation, int weight, T p); -template <> inline void incWithWeight(int* estimation, int weight, uchar p) { +template <> inline void incWithWeight(int* estimation, int weight, uchar p) +{ estimation[0] += weight * p; } -template <> inline void incWithWeight(int* estimation, int weight, Vec2b p) { +template <> inline void incWithWeight(int* estimation, int weight, Vec2b p) +{ estimation[0] += weight * p[0]; estimation[1] += weight * p[1]; } -template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) { +template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) +{ + estimation[0] += weight * p[0]; + estimation[1] += weight * p[1]; + estimation[2] += weight * p[2]; +} + +template <> inline void incWithWeight(int* estimation, int weight, int p) +{ + estimation[0] += weight * p; +} + +template <> inline void incWithWeight(int* estimation, int weight, Vec2i p) +{ + estimation[0] += weight * p[0]; + estimation[1] += weight * p[1]; +} + +template <> inline void incWithWeight(int* estimation, int weight, Vec3i p) +{ estimation[0] += weight * p[0]; estimation[1] += weight * p[1]; estimation[2] += weight * p[2]; @@ -93,18 +120,21 @@ template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) { template static inline T saturateCastFromArray(int* estimation); -template <> inline uchar saturateCastFromArray(int* estimation) { +template <> inline uchar saturateCastFromArray(int* estimation) +{ return saturate_cast(estimation[0]); } -template <> inline Vec2b saturateCastFromArray(int* estimation) { +template <> inline Vec2b saturateCastFromArray(int* estimation) +{ Vec2b res; res[0] = saturate_cast(estimation[0]); res[1] = saturate_cast(estimation[1]); return res; } -template <> inline Vec3b saturateCastFromArray(int* estimation) { +template <> inline Vec3b saturateCastFromArray(int* estimation) +{ Vec3b res; res[0] = saturate_cast(estimation[0]); res[1] = saturate_cast(estimation[1]); @@ -112,4 +142,20 @@ template <> inline Vec3b saturateCastFromArray(int* estimation) { return res; } +template <> inline int saturateCastFromArray(int* estimation) +{ + return estimation[0]; +} + +template <> inline Vec2i saturateCastFromArray(int* estimation) +{ + estimation[1] = 0; + return Vec2i(estimation); +} + +template <> inline Vec3i saturateCastFromArray(int* estimation) +{ + return Vec3i(estimation); +} + #endif diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp new file mode 100644 index 000000000..ad1d94254 --- /dev/null +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -0,0 +1,162 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__ +#define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__ + +#include "precomp.hpp" +#include "opencl_kernels.hpp" + +#ifdef HAVE_OPENCL + +namespace cv { + +enum +{ + BLOCK_ROWS = 32, + BLOCK_COLS = 32, + CTA_SIZE = 256 +}; + +static int divUp(int a, int b) +{ + return (a + b - 1) / b; +} + +template +static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn, + int & almostTemplateWindowSizeSqBinShift) +{ + const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255; + int fixedPointMult = std::numeric_limits::max() / maxEstimateSumValue; + int depth = DataType::depth; + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + + if (depth == CV_64F && !doubleSupport) + return false; + + // precalc weight for every possible l2 dist between blocks + // additional optimization of precalced weights to replace division(averaging) by binary shift + CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX) + int templateWindowSizeSq = templateWindowSize * templateWindowSize; + almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq); + FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; + + const FT WEIGHT_THRESHOLD = 1e-3f; + int maxDist = 255 * 255 * cn; + int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); + FT den = 1.0f / (h * h * cn); + + almostDist2Weight.create(1, almostMaxDist, CV_32SC1); + + ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc, + format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth), + doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist, + almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD); + + size_t globalsize[1] = { almostMaxDist }; + return k.run(1, globalsize, NULL, false); +} + +static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, + int templateWindowSize, int searchWindowSize) +{ + int type = _src.type(), cn = CV_MAT_CN(type); + Size size = _src.size(); + + if ( type != CV_8UC1 || type != CV_8UC2 || type != CV_8UC4 ) + return false; + + int templateWindowHalfWize = templateWindowSize / 2; + int searchWindowHalfSize = searchWindowSize / 2; + templateWindowSize = templateWindowHalfWize * 2 + 1; + searchWindowSize = searchWindowHalfSize * 2 + 1; + int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS); + int almostTemplateWindowSizeSqBinShift = -1; + + char cvt[2][40]; + String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" + " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" + " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" + " -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s", + templateWindowSize, searchWindowSize, ocl::typeToStr(type), + ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE, + templateWindowHalfWize, searchWindowHalfSize, + ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn, + CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1])); + + ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts); + if (k.empty()) + return false; + + UMat almostDist2Weight; + if (!ocl_calcAlmostDist2Weight(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn, + almostTemplateWindowSizeSqBinShift)) + return false; + CV_Assert(almostTemplateWindowSizeSqBinShift >= 0); + + UMat srcex; + int borderSize = searchWindowHalfSize + templateWindowHalfWize; + copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT); + + _dst.create(size, type); + UMat dst = _dst.getUMat(); + + int searchWindowSizeSq = searchWindowSize * searchWindowSize; + Size upColSumSize(size.width, searchWindowSizeSq * nblocksy); + Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy); + UMat buffer(upColSumSize + colSumSize, CV_32SC(cn)); + + srcex = srcex(Rect(Point(borderSize, borderSize), size)); + k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst), + ocl::KernelArg::PtrReadOnly(almostDist2Weight), + ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); + + size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; + return k.run(2, globalsize, localsize, false); +} + +static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, + float h, float hForColorComponents, + int templateWindowSize, int searchWindowSize) +{ + UMat src = _src.getUMat(); + _dst.create(src.size(), src.type()); + UMat dst = _dst.getUMat(); + + UMat src_lab; + cvtColor(src, src_lab, COLOR_LBGR2Lab); + + UMat l(src.size(), CV_8U); + UMat ab(src.size(), CV_8UC2); + std::vector l_ab(2), l_ab_denoised(2); + l_ab[0] = l; + l_ab[1] = ab; + l_ab_denoised[0].create(src.size(), CV_8U); + l_ab_denoised[1].create(src.size(), CV_8UC2); + + int from_to[] = { 0,0, 1,1, 2,2 }; + mixChannels(std::vector(1, src_lab), l_ab, from_to, 3); + + fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize); + fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize); + + UMat dst_lab(src.size(), src.type()); + mixChannels(l_ab_denoised, std::vector(1, dst_lab), from_to, 3); + + cvtColor(dst_lab, dst, COLOR_Lab2LBGR); + return true; +} + +} + +#endif +#endif diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp index e2351a23c..191a67127 100644 --- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp @@ -51,51 +51,47 @@ using namespace cv; template -struct FastNlMeansMultiDenoisingInvoker : ParallelLoopBody { - public: - FastNlMeansMultiDenoisingInvoker( - const std::vector& srcImgs, int imgToDenoiseIndex, int temporalWindowSize, - Mat& dst, int template_window_size, int search_window_size, const float h); +struct FastNlMeansMultiDenoisingInvoker : + ParallelLoopBody +{ +public: + FastNlMeansMultiDenoisingInvoker(const std::vector& srcImgs, int imgToDenoiseIndex, + int temporalWindowSize, Mat& dst, int template_window_size, + int search_window_size, const float h); - void operator() (const Range& range) const; + void operator() (const Range& range) const; - private: - void operator= (const FastNlMeansMultiDenoisingInvoker&); +private: + void operator= (const FastNlMeansMultiDenoisingInvoker&); - int rows_; - int cols_; + int rows_; + int cols_; - Mat& dst_; + Mat& dst_; - std::vector extended_srcs_; - Mat main_extended_src_; - int border_size_; + std::vector extended_srcs_; + Mat main_extended_src_; + int border_size_; - int template_window_size_; - int search_window_size_; - int temporal_window_size_; + int template_window_size_; + int search_window_size_; + int temporal_window_size_; - int template_window_half_size_; - int search_window_half_size_; - int temporal_window_half_size_; + int template_window_half_size_; + int search_window_half_size_; + int temporal_window_half_size_; - int fixed_point_mult_; - int almost_template_window_size_sq_bin_shift; - std::vector almost_dist2weight; + int fixed_point_mult_; + int almost_template_window_size_sq_bin_shift; + std::vector almost_dist2weight; - void calcDistSumsForFirstElementInRow( - int i, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const; + void calcDistSumsForFirstElementInRow(int i, Array3d& dist_sums, + Array4d& col_dist_sums, + Array4d& up_col_dist_sums) const; - void calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const; + void calcDistSumsForElementInFirstRow(int i, int j, int first_col_num, + Array3d& dist_sums, Array4d& col_dist_sums, + Array4d& up_col_dist_sums) const; }; template @@ -106,7 +102,8 @@ FastNlMeansMultiDenoisingInvoker::FastNlMeansMultiDenoisingInvoker( cv::Mat& dst, int template_window_size, int search_window_size, - const float h) : dst_(dst), extended_srcs_(srcImgs.size()) + const float h) : + dst_(dst), extended_srcs_(srcImgs.size()) { CV_Assert(srcImgs.size() > 0); CV_Assert(srcImgs[0].channels() == sizeof(T)); @@ -123,85 +120,84 @@ FastNlMeansMultiDenoisingInvoker::FastNlMeansMultiDenoisingInvoker( temporal_window_size_ = temporal_window_half_size_ * 2 + 1; border_size_ = search_window_half_size_ + template_window_half_size_; - for (int i = 0; i < temporal_window_size_; i++) { - copyMakeBorder( - srcImgs[imgToDenoiseIndex - temporal_window_half_size_ + i], extended_srcs_[i], + for (int i = 0; i < temporal_window_size_; i++) + copyMakeBorder(srcImgs[imgToDenoiseIndex - temporal_window_half_size_ + i], extended_srcs_[i], border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT); - } + main_extended_src_ = extended_srcs_[temporal_window_half_size_]; - - const int max_estimate_sum_value = - temporal_window_size_ * search_window_size_ * search_window_size_ * 255; - + const int max_estimate_sum_value = temporal_window_size_ * search_window_size_ * search_window_size_ * 255; fixed_point_mult_ = std::numeric_limits::max() / max_estimate_sum_value; // precalc weight for every possible l2 dist between blocks // additional optimization of precalced weights to replace division(averaging) by binary shift int template_window_size_sq = template_window_size_ * template_window_size_; almost_template_window_size_sq_bin_shift = 0; - while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq) { + while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq) almost_template_window_size_sq_bin_shift++; - } int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift; - double almost_dist2actual_dist_multiplier = - ((double) almost_template_window_size_sq) / template_window_size_sq; + double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq; int max_dist = 255 * 255 * sizeof(T); int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1); almost_dist2weight.resize(almost_max_dist); const double WEIGHT_THRESHOLD = 0.001; - for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) { + for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) + { double dist = almost_dist * almost_dist2actual_dist_multiplier; int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); - if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) { + if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) weight = 0; - } almost_dist2weight[almost_dist] = weight; } CV_Assert(almost_dist2weight[0] == fixed_point_mult_); - // additional optimization init end - if (dst_.empty()) { + // additional optimization init end + if (dst_.empty()) dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type()); - } } template -void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const { +void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const +{ int row_from = range.start; int row_to = range.end - 1; Array3d dist_sums(temporal_window_size_, search_window_size_, search_window_size_); // for lazy calc optimization - Array4d col_dist_sums( - template_window_size_, temporal_window_size_, search_window_size_, search_window_size_); + Array4d col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_); int first_col_num = -1; + Array4d up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_); - Array4d up_col_dist_sums( - cols_, temporal_window_size_, search_window_size_, search_window_size_); - - for (int i = row_from; i <= row_to; i++) { - for (int j = 0; j < cols_; j++) { + for (int i = row_from; i <= row_to; i++) + { + for (int j = 0; j < cols_; j++) + { int search_window_y = i - search_window_half_size_; int search_window_x = j - search_window_half_size_; // calc dist_sums - if (j == 0) { + if (j == 0) + { calcDistSumsForFirstElementInRow(i, dist_sums, col_dist_sums, up_col_dist_sums); first_col_num = 0; - - } else { // calc cur dist_sums using previous dist_sums - if (i == row_from) { + } + else + { + // calc cur dist_sums using previous dist_sums + if (i == row_from) + { calcDistSumsForElementInFirstRow(i, j, first_col_num, dist_sums, col_dist_sums, up_col_dist_sums); - } else { + } + else + { int ay = border_size_ + i; int ax = border_size_ + j + template_window_half_size_; @@ -217,36 +213,31 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const // copy class member to local variable for optimization int search_window_size = search_window_size_; - for (int d = 0; d < temporal_window_size_; d++) { + for (int d = 0; d < temporal_window_size_; d++) + { Mat cur_extended_src = extended_srcs_[d]; Array2d cur_dist_sums = dist_sums[d]; Array2d cur_col_dist_sums = col_dist_sums[first_col_num][d]; Array2d cur_up_col_dist_sums = up_col_dist_sums[j][d]; - for (int y = 0; y < search_window_size; y++) { + for (int y = 0; y < search_window_size; y++) + { int* dist_sums_row = cur_dist_sums.row_ptr(y); int* col_dist_sums_row = cur_col_dist_sums.row_ptr(y); - int* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y); - const T* b_up_ptr = - cur_extended_src.ptr(start_by - template_window_half_size_ - 1 + y); - const T* b_down_ptr = - cur_extended_src.ptr(start_by + template_window_half_size_ + y); + const T* b_up_ptr = cur_extended_src.ptr(start_by - template_window_half_size_ - 1 + y); + const T* b_down_ptr = cur_extended_src.ptr(start_by + template_window_half_size_ + y); - for (int x = 0; x < search_window_size; x++) { + for (int x = 0; x < search_window_size; x++) + { dist_sums_row[x] -= col_dist_sums_row[x]; col_dist_sums_row[x] = up_col_dist_sums_row[x] + - calcUpDownDist( - a_up, a_down, - b_up_ptr[start_bx + x], b_down_ptr[start_bx + x] - ); + calcUpDownDist(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]); dist_sums_row[x] += col_dist_sums_row[x]; - up_col_dist_sums_row[x] = col_dist_sums_row[x]; - } } } @@ -259,19 +250,21 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const int weights_sum = 0; int estimation[3]; - for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) { + for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) estimation[channel_num] = 0; - } - for (int d = 0; d < temporal_window_size_; d++) { + + for (int d = 0; d < temporal_window_size_; d++) + { const Mat& esrc_d = extended_srcs_[d]; - for (int y = 0; y < search_window_size_; y++) { + for (int y = 0; y < search_window_size_; y++) + { const T* cur_row_ptr = esrc_d.ptr(border_size_ + search_window_y + y); int* dist_sums_row = dist_sums.row_ptr(d, y); - for (int x = 0; x < search_window_size_; x++) { - int almostAvgDist = - dist_sums_row[x] >> almost_template_window_size_sq_bin_shift; + for (int x = 0; x < search_window_size_; x++) + { + int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift; int weight = almost_dist2weight[almostAvgDist]; weights_sum += weight; @@ -293,21 +286,19 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const template inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForFirstElementInRow( - int i, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const + int i, Array3d& dist_sums, Array4d& col_dist_sums, Array4d& up_col_dist_sums) const { int j = 0; - for (int d = 0; d < temporal_window_size_; d++) { + for (int d = 0; d < temporal_window_size_; d++) + { Mat cur_extended_src = extended_srcs_[d]; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[d][y][x] = 0; - for (int tx = 0; tx < template_window_size_; tx++) { + for (int tx = 0; tx < template_window_size_; tx++) col_dist_sums[tx][d][y][x] = 0; - } int start_y = i + y - search_window_half_size_; int start_x = j + x - search_window_half_size_; @@ -315,14 +306,13 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForFirstElementInRo int* dist_sums_ptr = &dist_sums[d][y][x]; int* col_dist_sums_ptr = &col_dist_sums[0][d][y][x]; int col_dist_sums_step = col_dist_sums.step_size(0); - for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) { - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { + for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) + { + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) + { int dist = calcDist( - main_extended_src_.at( - border_size_ + i + ty, border_size_ + j + tx), - cur_extended_src.at( - border_size_ + start_y + ty, border_size_ + start_x + tx) - ); + main_extended_src_.at(border_size_ + i + ty, border_size_ + j + tx), + cur_extended_src.at(border_size_ + start_y + ty, border_size_ + start_x + tx)); *dist_sums_ptr += dist; *col_dist_sums_ptr += dist; @@ -332,18 +322,13 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForFirstElementInRo up_col_dist_sums[j][d][y][x] = col_dist_sums[template_window_size_ - 1][d][y][x]; } - } } } template inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const + int i, int j, int first_col_num, Array3d& dist_sums, + Array4d& col_dist_sums, Array4d& up_col_dist_sums) const { int ay = border_size_ + i; int ax = border_size_ + j + template_window_half_size_; @@ -353,10 +338,12 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForElementInFirstRo int new_last_col_num = first_col_num; - for (int d = 0; d < temporal_window_size_; d++) { + for (int d = 0; d < temporal_window_size_; d++) + { Mat cur_extended_src = extended_srcs_[d]; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[d][y][x] -= col_dist_sums[first_col_num][d][y][x]; col_dist_sums[new_last_col_num][d][y][x] = 0; @@ -364,19 +351,17 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForElementInFirstRo int bx = start_bx + x; int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x]; - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { - *col_dist_sums_ptr += - calcDist( - main_extended_src_.at(ay + ty, ax), - cur_extended_src.at(by + ty, bx) - ); + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) + { + *col_dist_sums_ptr += calcDist( + main_extended_src_.at(ay + ty, ax), + cur_extended_src.at(by + ty, bx)); } dist_sums[d][y][x] += col_dist_sums[new_last_col_num][d][y][x]; up_col_dist_sums[j][d][y][x] = col_dist_sums[new_last_col_num][d][y][x]; } - } } } diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl new file mode 100644 index 000000000..67a6ec61f --- /dev/null +++ b/modules/photo/src/opencl/nlmeans.cl @@ -0,0 +1,291 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifdef cl_amd_printf +#pragma OPENCL_EXTENSION cl_amd_printf:enable +#endif + +#ifdef DOUBLE_SUPPORT +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + + +#ifdef OP_CALC_WEIGHTS + +__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, + FT almostDist2ActualDistMultiplier, int fixedPointMult, + FT den, FT WEIGHT_THRESHOLD) +{ + int almostDist = get_global_id(0); + + if (almostDist < almostMaxDist) + { + FT dist = almostDist * almostDist2ActualDistMultiplier; + int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); + + if (weight < WEIGHT_THRESHOLD * fixedPointMult) + weight = 0; + + almostDist2Weight[almostDist] = weight; + } +} + +#elif defined OP_CALC_FASTNLMEANS + +#define noconvert + +#define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE) + +inline int calcDist(uchar_t a, uchar_t b) +{ + int_t diff = convert_int_t(a) - convert_int_t(b); + int_t retval = diff * diff; + +#if cn == 1 + return retval; +#elif cn == 2 + return retval.x + retval.y; +#else +#error "cn should be either 1 or 2" +#endif +} + +inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t) +{ + int_t A = convert_int_t(down_value) - convert_int_t(down_value_t); + int_t B = convert_int_t(up_value) - convert_int_t(up_value_t); + int_t retval = (A - B) * (A + B); + +#if cn == 1 + return retval; +#elif cn == 2 + return retval.x + retval.y; +#else +#error "cn should be either 1 or 2" +#endif +} + +#define COND if (x == 0 && y == 0) + +inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset, + __local int * dists, int y, int x, int id, + __global int * col_dists, __global int * up_col_dists) +{ + y -= TEMPLATE_SIZE2; + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + int col_dists_current_private[TEMPLATE_SIZE]; + + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + { + int dist = 0, value; + + __global const uchar_t * src_template = (__global const uchar_t *)(src + + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); + __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); + __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE; + + #pragma unroll + for (int j = 0; j < TEMPLATE_SIZE; ++j) + col_dists_current_private[j] = 0; + + for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) + { + #pragma unroll + for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx) + { + value = calcDist(src_template[tx], src_current[tx]); + + col_dists_current_private[tx + TEMPLATE_SIZE2] += value; + dist += value; + } + + src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); + src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); + } + + #pragma unroll + for (int j = 0; j < TEMPLATE_SIZE; ++j) + col_dists_current[j] = col_dists_current_private[j]; + + dists[i] = dist; + up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1]; + } +} + +inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, + __local int * dists, int y, int x0, int x, int id, int first, + __global int * col_dists, __global int * up_col_dists) +{ + x += TEMPLATE_SIZE2; + y -= TEMPLATE_SIZE2; + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + { + __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); + __global const uchar_t * src_template = (__global const uchar_t *)(src + + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); + __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i; + + int col_dist = 0; + + #pragma unroll + for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) + { + col_dist += calcDist(src_current[0], src_template[0]); + + src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); + src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); + } + + dists[i] += col_dist - col_dists_current[first]; + col_dists_current[first] = col_dist; + up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist; + } +} + +inline void calcElement(__global const uchar * src, int src_step, int src_offset, + __local int * dists, int y, int x0, int x, int id, int first, + __global int * col_dists, __global int * up_col_dists) +{ + int sx = x + TEMPLATE_SIZE2; + int sy_up = y - TEMPLATE_SIZE2 - 1; + int sy_down = y + TEMPLATE_SIZE2; + + uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset))); + uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset))); + + sx -= SEARCH_SIZE2; + sy_up -= SEARCH_SIZE2; + sy_down -= SEARCH_SIZE2; + + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + { + int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE; + + uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset))); + uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset))); + + __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); + __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); + + int col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t); + + dists[i] += col_dist - col_dists_current[0]; + col_dists_current[0] = col_dist; + up_col_dists_current[0] = col_dist; + } +} + +inline void convolveWindow(__global const uchar * src, int src_step, int src_offset, + __local int * dists, __global const int * almostDist2Weight, + __global uchar * dst, int dst_step, int dst_offset, + int y, int x, int id, __local int * weights_local, + __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) +{ + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0; + int_t weighted_sum = (int_t)(0); + + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + { + int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset)); + int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index)); + + int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; + int weight = almostDist2Weight[almostAvgDist]; + + weights += weight; + weighted_sum += (int_t)(weight) * src_value; + } + + if (id >= CTA_SIZE2) + { + int id2 = id - CTA_SIZE2; + weights_local[id2] = weights; + weighted_sum_local[id2] = weighted_sum; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (id < CTA_SIZE2) + { + weights_local[id] += weights; + weighted_sum_local[id] += weighted_sum; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = CTA_SIZE2 >> 1; lsize > 2; lsize >>= 1) + { + if (id < lsize) + { + int id2 = lsize + id; + weights_local[id] += weights_local[id2]; + weighted_sum_local[id] += weighted_sum_local[id2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (id == 0) + { + int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset)); + int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + + weighted_sum_local[2] + weighted_sum_local[3]; + int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; + + *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / (int_t)(weights_local_0)); + } +} + +__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __global const int * almostDist2Weight, __global uchar * buffer, + int almostTemplateWindowSizeSqBinShift) +{ + int block_x = get_group_id(0), nblocks_x = get_num_groups(0); + int block_y = get_group_id(1); + int id = get_local_id(0), first; + + __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE2]; + __local int_t weighted_sum[CTA_SIZE2]; + + int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); + int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); + + // for each group we need SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer for storing part column sum for current element + // and SEARCH_SIZE_SQ * BLOCK_COLS integer buffer for storing last column sum for each element of search window of up row + int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE); + __global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int)); + __global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE; + + for (int y = y0; y < y1; ++y) + for (int x = x0; x < x1; ++x) + { + if (x == x0) + { + calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); + first = 0; + } + else + { + if (y == y0) + calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); + else + calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); + + first = (first + 1) % TEMPLATE_SIZE; + } + barrier(CLK_LOCAL_MEM_FENCE); + + convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, + y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift); + } +} + +#endif diff --git a/modules/photo/src/precomp.hpp b/modules/photo/src/precomp.hpp index 38ac3ffcd..4355f13c0 100644 --- a/modules/photo/src/precomp.hpp +++ b/modules/photo/src/precomp.hpp @@ -46,6 +46,8 @@ #include "opencv2/core/private.hpp" #include "opencv2/core/utility.hpp" #include "opencv2/photo.hpp" +#include "opencv2/core/ocl.hpp" +#include "opencv2/imgproc.hpp" #ifdef HAVE_TEGRA_OPTIMIZATION #include "opencv2/photo/photo_tegra.hpp" diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp new file mode 100644 index 000000000..745a45745 --- /dev/null +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -0,0 +1,95 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#include "test_precomp.hpp" +#include "opencv2/ts/ocl_test.hpp" + +#ifdef HAVE_OPENCL + +namespace cvtest { +namespace ocl { + +PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool) +{ + int cn, templateWindowSize, searchWindowSize; + float h; + bool use_roi; + + TEST_DECLARE_INPUT_PARAMETER(src) + TEST_DECLARE_OUTPUT_PARAMETER(dst) + + virtual void SetUp() + { + cn = GET_PARAM(0); + use_roi = GET_PARAM(1); + + templateWindowSize = 7; + searchWindowSize = 21; + h = 3.0f; + } + + virtual void generateTestData() + { + Mat image; + if (cn == 1) + { + image = readImage("denoising/lena_noised_gaussian_sigma=10.png", IMREAD_GRAYSCALE); + ASSERT_FALSE(image.empty()); + } + + const int type = CV_8UC(cn); + + Size roiSize = cn == 1 ? image.size() : randomSize(1, MAX_VALUE); + Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255); + if (cn == 1) + image.copyTo(src_roi); + + Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255); + + UMAT_UPLOAD_INPUT_PARAMETER(src) + UMAT_UPLOAD_OUTPUT_PARAMETER(dst) + } +}; + +typedef FastNlMeansDenoisingTestBase FastNlMeansDenoising; + +OCL_TEST_P(FastNlMeansDenoising, Mat) +{ + for (int j = 0; j < test_loop_times; j++) + { + generateTestData(); + + OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize)); + OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize)); + + OCL_EXPECT_MATS_NEAR(dst, 1) + } +} + +typedef FastNlMeansDenoisingTestBase fastNlMeansDenoisingColored; + +OCL_TEST_P(fastNlMeansDenoisingColored, Mat) +{ + for (int j = 0; j < test_loop_times; j++) + { + generateTestData(); + + OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize)); + OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize)); + + OCL_EXPECT_MATS_NEAR(dst, 1) + } +} + +OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool())); +OCL_INSTANTIATE_TEST_CASE_P(Photo, fastNlMeansDenoisingColored, Combine(Values(Channels(3)), Bool())); + +} } // namespace cvtest::ocl + +#endif // HAVE_OPENCL