From e16d89e8d68895d5899b7fded655b98185ceeb54 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 11 Feb 2014 19:08:13 +0400 Subject: [PATCH 01/11] some refactoring --- .../src/fast_nlmeans_denoising_invoker.hpp | 155 ++++++------ ...fast_nlmeans_denoising_invoker_commons.hpp | 37 ++- .../fast_nlmeans_multi_denoising_invoker.hpp | 227 ++++++++---------- 3 files changed, 210 insertions(+), 209 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp index 3e9cc008b..b274c6889 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp @@ -51,61 +51,61 @@ using namespace cv; template -struct FastNlMeansDenoisingInvoker : ParallelLoopBody { - public: - FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst, - int template_window_size, int search_window_size, const float h); +struct FastNlMeansDenoisingInvoker : + public ParallelLoopBody +{ +public: + FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst, + int template_window_size, int search_window_size, const float h); - void operator() (const Range& range) const; + void operator() (const Range& range) const; - private: - void operator= (const FastNlMeansDenoisingInvoker&); +private: + void operator= (const FastNlMeansDenoisingInvoker&); - const Mat& src_; - Mat& dst_; + const Mat& src_; + Mat& dst_; - Mat extended_src_; - int border_size_; + Mat extended_src_; + int border_size_; - int template_window_size_; - int search_window_size_; + int template_window_size_; + int search_window_size_; - int template_window_half_size_; - int search_window_half_size_; + int template_window_half_size_; + int search_window_half_size_; - int fixed_point_mult_; - int almost_template_window_size_sq_bin_shift_; - std::vector almost_dist2weight_; + int fixed_point_mult_; + int almost_template_window_size_sq_bin_shift_; + std::vector almost_dist2weight_; - void calcDistSumsForFirstElementInRow( - int i, - Array2d& dist_sums, - Array3d& col_dist_sums, - Array3d& up_col_dist_sums) const; + void calcDistSumsForFirstElementInRow( + int i, Array2d& dist_sums, + Array3d& col_dist_sums, + Array3d& up_col_dist_sums) const; - void calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, - Array2d& dist_sums, - Array3d& col_dist_sums, - Array3d& up_col_dist_sums) const; + void calcDistSumsForElementInFirstRow( + int i, int j, int first_col_num, + Array2d& dist_sums, + Array3d& col_dist_sums, + Array3d& up_col_dist_sums) const; }; inline int getNearestPowerOf2(int value) { int p = 0; - while( 1 << p < value) ++p; + while( 1 << p < value) + ++p; return p; } template FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( - const cv::Mat& src, - cv::Mat& dst, + const cv::Mat& src, cv::Mat& dst, int template_window_size, int search_window_size, - const float h) : src_(src), dst_(dst) + const float h) : + src_(src), dst_(dst) { CV_Assert(src.channels() == sizeof(T)); //T is Vec1b or Vec2b or Vec3b @@ -134,7 +134,8 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( almost_dist2weight_.resize(almost_max_dist); const double WEIGHT_THRESHOLD = 0.001; - for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) { + for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) + { double dist = almost_dist * almost_dist2actual_dist_multiplier; int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); @@ -144,15 +145,15 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( almost_dist2weight_[almost_dist] = weight; } CV_Assert(almost_dist2weight_[0] == fixed_point_mult_); - // additional optimization init end - if (dst_.empty()) { + // additional optimization init end + if (dst_.empty()) dst_ = Mat::zeros(src_.size(), src_.type()); - } } template -void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { +void FastNlMeansDenoisingInvoker::operator() (const Range& range) const +{ int row_from = range.start; int row_to = range.end - 1; @@ -164,30 +165,36 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { int first_col_num = -1; Array3d up_col_dist_sums(src_.cols, search_window_size_, search_window_size_); - for (int i = row_from; i <= row_to; i++) { - for (int j = 0; j < src_.cols; j++) { + for (int i = row_from; i <= row_to; i++) + { + for (int j = 0; j < src_.cols; j++) + { int search_window_y = i - search_window_half_size_; int search_window_x = j - search_window_half_size_; // calc dist_sums - if (j == 0) { + if (j == 0) + { calcDistSumsForFirstElementInRow(i, dist_sums, col_dist_sums, up_col_dist_sums); first_col_num = 0; - } else { // calc cur dist_sums using previous dist_sums - if (i == row_from) { + } + else + { + // calc cur dist_sums using previous dist_sums + if (i == row_from) + { calcDistSumsForElementInFirstRow(i, j, first_col_num, dist_sums, col_dist_sums, up_col_dist_sums); - } else { + } + else + { int ay = border_size_ + i; int ax = border_size_ + j + template_window_half_size_; - int start_by = - border_size_ + i - search_window_half_size_; - - int start_bx = - border_size_ + j - search_window_half_size_ + template_window_half_size_; + int start_by = border_size_ + i - search_window_half_size_; + int start_bx = border_size_ + j - search_window_half_size_ + template_window_half_size_; T a_up = extended_src_.at(ay - template_window_half_size_ - 1, ax); T a_down = extended_src_.at(ay + template_window_half_size_, ax); @@ -195,20 +202,18 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { // copy class member to local variable for optimization int search_window_size = search_window_size_; - for (int y = 0; y < search_window_size; y++) { + for (int y = 0; y < search_window_size; y++) + { int* dist_sums_row = dist_sums.row_ptr(y); int* col_dist_sums_row = col_dist_sums.row_ptr(first_col_num,y); - int* up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y); - const T* b_up_ptr = - extended_src_.ptr(start_by - template_window_half_size_ - 1 + y); + const T* b_up_ptr = extended_src_.ptr(start_by - template_window_half_size_ - 1 + y); + const T* b_down_ptr = extended_src_.ptr(start_by + template_window_half_size_ + y); - const T* b_down_ptr = - extended_src_.ptr(start_by + template_window_half_size_ + y); - - for (int x = 0; x < search_window_size; x++) { + for (int x = 0; x < search_window_size; x++) + { dist_sums_row[x] -= col_dist_sums_row[x]; col_dist_sums_row[x] = @@ -233,14 +238,15 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { int weights_sum = 0; int estimation[3]; - for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) { + for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) estimation[channel_num] = 0; - } - for (int y = 0; y < search_window_size_; y++) { + for (int y = 0; y < search_window_size_; y++) + { const T* cur_row_ptr = extended_src_.ptr(border_size_ + search_window_y + y); int* dist_sums_row = dist_sums.row_ptr(y); - for (int x = 0; x < search_window_size_; x++) { + for (int x = 0; x < search_window_size_; x++) + { int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_; @@ -269,18 +275,19 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForFirstElementInRow( { int j = 0; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[y][x] = 0; - for (int tx = 0; tx < template_window_size_; tx++) { + for (int tx = 0; tx < template_window_size_; tx++) col_dist_sums[tx][y][x] = 0; - } int start_y = i + y - search_window_half_size_; int start_x = j + x - search_window_half_size_; - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { - for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) { + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) + for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) + { int dist = calcDist(extended_src_, border_size_ + i + ty, border_size_ + j + tx, border_size_ + start_y + ty, border_size_ + start_x + tx); @@ -288,11 +295,9 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForFirstElementInRow( dist_sums[y][x] += dist; col_dist_sums[tx + template_window_half_size_][y][x] += dist; } - } up_col_dist_sums[j][y][x] = col_dist_sums[template_window_size_ - 1][y][x]; } - } } template @@ -312,23 +317,21 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForElementInFirstRow( int new_last_col_num = first_col_num; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[y][x] -= col_dist_sums[first_col_num][y][x]; col_dist_sums[new_last_col_num][y][x] = 0; int by = start_by + y; int bx = start_bx + x; - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) col_dist_sums[new_last_col_num][y][x] += calcDist(extended_src_, ay + ty, ax, by + ty, bx); - } dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x]; - up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x]; } - } } #endif diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp index 978f3170c..3e8f4c498 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp @@ -46,29 +46,35 @@ using namespace cv; template static inline int calcDist(const T a, const T b); -template <> inline int calcDist(const uchar a, const uchar b) { +template <> inline int calcDist(const uchar a, const uchar b) +{ return (a-b) * (a-b); } -template <> inline int calcDist(const Vec2b a, const Vec2b b) { +template <> inline int calcDist(const Vec2b a, const Vec2b b) +{ return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]); } -template <> inline int calcDist(const Vec3b a, const Vec3b b) { +template <> inline int calcDist(const Vec3b a, const Vec3b b) +{ return (a[0]-b[0])*(a[0]-b[0]) + (a[1]-b[1])*(a[1]-b[1]) + (a[2]-b[2])*(a[2]-b[2]); } -template static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2) { +template static inline int calcDist(const Mat& m, int i1, int j1, int i2, int j2) +{ const T a = m.at(i1, j1); const T b = m.at(i2, j2); return calcDist(a,b); } -template static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) { +template static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) +{ return calcDist(a_down,b_down) - calcDist(a_up, b_up); } -template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar b_up, uchar b_down) { +template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar b_up, uchar b_down) +{ int A = a_down - b_down; int B = a_up - b_up; return (A-B)*(A+B); @@ -76,16 +82,20 @@ template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar b_up, uch template static inline void incWithWeight(int* estimation, int weight, T p); -template <> inline void incWithWeight(int* estimation, int weight, uchar p) { +template <> inline void incWithWeight(int* estimation, int weight, uchar p) +{ + estimation[0] += weight * p; } -template <> inline void incWithWeight(int* estimation, int weight, Vec2b p) { +template <> inline void incWithWeight(int* estimation, int weight, Vec2b p) +{ estimation[0] += weight * p[0]; estimation[1] += weight * p[1]; } -template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) { +template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) +{ estimation[0] += weight * p[0]; estimation[1] += weight * p[1]; estimation[2] += weight * p[2]; @@ -93,18 +103,21 @@ template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) { template static inline T saturateCastFromArray(int* estimation); -template <> inline uchar saturateCastFromArray(int* estimation) { +template <> inline uchar saturateCastFromArray(int* estimation) +{ return saturate_cast(estimation[0]); } -template <> inline Vec2b saturateCastFromArray(int* estimation) { +template <> inline Vec2b saturateCastFromArray(int* estimation) +{ Vec2b res; res[0] = saturate_cast(estimation[0]); res[1] = saturate_cast(estimation[1]); return res; } -template <> inline Vec3b saturateCastFromArray(int* estimation) { +template <> inline Vec3b saturateCastFromArray(int* estimation) +{ Vec3b res; res[0] = saturate_cast(estimation[0]); res[1] = saturate_cast(estimation[1]); diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp index e2351a23c..191a67127 100644 --- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp @@ -51,51 +51,47 @@ using namespace cv; template -struct FastNlMeansMultiDenoisingInvoker : ParallelLoopBody { - public: - FastNlMeansMultiDenoisingInvoker( - const std::vector& srcImgs, int imgToDenoiseIndex, int temporalWindowSize, - Mat& dst, int template_window_size, int search_window_size, const float h); +struct FastNlMeansMultiDenoisingInvoker : + ParallelLoopBody +{ +public: + FastNlMeansMultiDenoisingInvoker(const std::vector& srcImgs, int imgToDenoiseIndex, + int temporalWindowSize, Mat& dst, int template_window_size, + int search_window_size, const float h); - void operator() (const Range& range) const; + void operator() (const Range& range) const; - private: - void operator= (const FastNlMeansMultiDenoisingInvoker&); +private: + void operator= (const FastNlMeansMultiDenoisingInvoker&); - int rows_; - int cols_; + int rows_; + int cols_; - Mat& dst_; + Mat& dst_; - std::vector extended_srcs_; - Mat main_extended_src_; - int border_size_; + std::vector extended_srcs_; + Mat main_extended_src_; + int border_size_; - int template_window_size_; - int search_window_size_; - int temporal_window_size_; + int template_window_size_; + int search_window_size_; + int temporal_window_size_; - int template_window_half_size_; - int search_window_half_size_; - int temporal_window_half_size_; + int template_window_half_size_; + int search_window_half_size_; + int temporal_window_half_size_; - int fixed_point_mult_; - int almost_template_window_size_sq_bin_shift; - std::vector almost_dist2weight; + int fixed_point_mult_; + int almost_template_window_size_sq_bin_shift; + std::vector almost_dist2weight; - void calcDistSumsForFirstElementInRow( - int i, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const; + void calcDistSumsForFirstElementInRow(int i, Array3d& dist_sums, + Array4d& col_dist_sums, + Array4d& up_col_dist_sums) const; - void calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const; + void calcDistSumsForElementInFirstRow(int i, int j, int first_col_num, + Array3d& dist_sums, Array4d& col_dist_sums, + Array4d& up_col_dist_sums) const; }; template @@ -106,7 +102,8 @@ FastNlMeansMultiDenoisingInvoker::FastNlMeansMultiDenoisingInvoker( cv::Mat& dst, int template_window_size, int search_window_size, - const float h) : dst_(dst), extended_srcs_(srcImgs.size()) + const float h) : + dst_(dst), extended_srcs_(srcImgs.size()) { CV_Assert(srcImgs.size() > 0); CV_Assert(srcImgs[0].channels() == sizeof(T)); @@ -123,85 +120,84 @@ FastNlMeansMultiDenoisingInvoker::FastNlMeansMultiDenoisingInvoker( temporal_window_size_ = temporal_window_half_size_ * 2 + 1; border_size_ = search_window_half_size_ + template_window_half_size_; - for (int i = 0; i < temporal_window_size_; i++) { - copyMakeBorder( - srcImgs[imgToDenoiseIndex - temporal_window_half_size_ + i], extended_srcs_[i], + for (int i = 0; i < temporal_window_size_; i++) + copyMakeBorder(srcImgs[imgToDenoiseIndex - temporal_window_half_size_ + i], extended_srcs_[i], border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT); - } + main_extended_src_ = extended_srcs_[temporal_window_half_size_]; - - const int max_estimate_sum_value = - temporal_window_size_ * search_window_size_ * search_window_size_ * 255; - + const int max_estimate_sum_value = temporal_window_size_ * search_window_size_ * search_window_size_ * 255; fixed_point_mult_ = std::numeric_limits::max() / max_estimate_sum_value; // precalc weight for every possible l2 dist between blocks // additional optimization of precalced weights to replace division(averaging) by binary shift int template_window_size_sq = template_window_size_ * template_window_size_; almost_template_window_size_sq_bin_shift = 0; - while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq) { + while (1 << almost_template_window_size_sq_bin_shift < template_window_size_sq) almost_template_window_size_sq_bin_shift++; - } int almost_template_window_size_sq = 1 << almost_template_window_size_sq_bin_shift; - double almost_dist2actual_dist_multiplier = - ((double) almost_template_window_size_sq) / template_window_size_sq; + double almost_dist2actual_dist_multiplier = (double) almost_template_window_size_sq / template_window_size_sq; int max_dist = 255 * 255 * sizeof(T); int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1); almost_dist2weight.resize(almost_max_dist); const double WEIGHT_THRESHOLD = 0.001; - for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) { + for (int almost_dist = 0; almost_dist < almost_max_dist; almost_dist++) + { double dist = almost_dist * almost_dist2actual_dist_multiplier; int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); - if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) { + if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) weight = 0; - } almost_dist2weight[almost_dist] = weight; } CV_Assert(almost_dist2weight[0] == fixed_point_mult_); - // additional optimization init end - if (dst_.empty()) { + // additional optimization init end + if (dst_.empty()) dst_ = Mat::zeros(srcImgs[0].size(), srcImgs[0].type()); - } } template -void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const { +void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const +{ int row_from = range.start; int row_to = range.end - 1; Array3d dist_sums(temporal_window_size_, search_window_size_, search_window_size_); // for lazy calc optimization - Array4d col_dist_sums( - template_window_size_, temporal_window_size_, search_window_size_, search_window_size_); + Array4d col_dist_sums(template_window_size_, temporal_window_size_, search_window_size_, search_window_size_); int first_col_num = -1; + Array4d up_col_dist_sums(cols_, temporal_window_size_, search_window_size_, search_window_size_); - Array4d up_col_dist_sums( - cols_, temporal_window_size_, search_window_size_, search_window_size_); - - for (int i = row_from; i <= row_to; i++) { - for (int j = 0; j < cols_; j++) { + for (int i = row_from; i <= row_to; i++) + { + for (int j = 0; j < cols_; j++) + { int search_window_y = i - search_window_half_size_; int search_window_x = j - search_window_half_size_; // calc dist_sums - if (j == 0) { + if (j == 0) + { calcDistSumsForFirstElementInRow(i, dist_sums, col_dist_sums, up_col_dist_sums); first_col_num = 0; - - } else { // calc cur dist_sums using previous dist_sums - if (i == row_from) { + } + else + { + // calc cur dist_sums using previous dist_sums + if (i == row_from) + { calcDistSumsForElementInFirstRow(i, j, first_col_num, dist_sums, col_dist_sums, up_col_dist_sums); - } else { + } + else + { int ay = border_size_ + i; int ax = border_size_ + j + template_window_half_size_; @@ -217,36 +213,31 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const // copy class member to local variable for optimization int search_window_size = search_window_size_; - for (int d = 0; d < temporal_window_size_; d++) { + for (int d = 0; d < temporal_window_size_; d++) + { Mat cur_extended_src = extended_srcs_[d]; Array2d cur_dist_sums = dist_sums[d]; Array2d cur_col_dist_sums = col_dist_sums[first_col_num][d]; Array2d cur_up_col_dist_sums = up_col_dist_sums[j][d]; - for (int y = 0; y < search_window_size; y++) { + for (int y = 0; y < search_window_size; y++) + { int* dist_sums_row = cur_dist_sums.row_ptr(y); int* col_dist_sums_row = cur_col_dist_sums.row_ptr(y); - int* up_col_dist_sums_row = cur_up_col_dist_sums.row_ptr(y); - const T* b_up_ptr = - cur_extended_src.ptr(start_by - template_window_half_size_ - 1 + y); - const T* b_down_ptr = - cur_extended_src.ptr(start_by + template_window_half_size_ + y); + const T* b_up_ptr = cur_extended_src.ptr(start_by - template_window_half_size_ - 1 + y); + const T* b_down_ptr = cur_extended_src.ptr(start_by + template_window_half_size_ + y); - for (int x = 0; x < search_window_size; x++) { + for (int x = 0; x < search_window_size; x++) + { dist_sums_row[x] -= col_dist_sums_row[x]; col_dist_sums_row[x] = up_col_dist_sums_row[x] + - calcUpDownDist( - a_up, a_down, - b_up_ptr[start_bx + x], b_down_ptr[start_bx + x] - ); + calcUpDownDist(a_up, a_down, b_up_ptr[start_bx + x], b_down_ptr[start_bx + x]); dist_sums_row[x] += col_dist_sums_row[x]; - up_col_dist_sums_row[x] = col_dist_sums_row[x]; - } } } @@ -259,19 +250,21 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const int weights_sum = 0; int estimation[3]; - for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) { + for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) estimation[channel_num] = 0; - } - for (int d = 0; d < temporal_window_size_; d++) { + + for (int d = 0; d < temporal_window_size_; d++) + { const Mat& esrc_d = extended_srcs_[d]; - for (int y = 0; y < search_window_size_; y++) { + for (int y = 0; y < search_window_size_; y++) + { const T* cur_row_ptr = esrc_d.ptr(border_size_ + search_window_y + y); int* dist_sums_row = dist_sums.row_ptr(d, y); - for (int x = 0; x < search_window_size_; x++) { - int almostAvgDist = - dist_sums_row[x] >> almost_template_window_size_sq_bin_shift; + for (int x = 0; x < search_window_size_; x++) + { + int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift; int weight = almost_dist2weight[almostAvgDist]; weights_sum += weight; @@ -293,21 +286,19 @@ void FastNlMeansMultiDenoisingInvoker::operator() (const Range& range) const template inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForFirstElementInRow( - int i, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const + int i, Array3d& dist_sums, Array4d& col_dist_sums, Array4d& up_col_dist_sums) const { int j = 0; - for (int d = 0; d < temporal_window_size_; d++) { + for (int d = 0; d < temporal_window_size_; d++) + { Mat cur_extended_src = extended_srcs_[d]; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[d][y][x] = 0; - for (int tx = 0; tx < template_window_size_; tx++) { + for (int tx = 0; tx < template_window_size_; tx++) col_dist_sums[tx][d][y][x] = 0; - } int start_y = i + y - search_window_half_size_; int start_x = j + x - search_window_half_size_; @@ -315,14 +306,13 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForFirstElementInRo int* dist_sums_ptr = &dist_sums[d][y][x]; int* col_dist_sums_ptr = &col_dist_sums[0][d][y][x]; int col_dist_sums_step = col_dist_sums.step_size(0); - for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) { - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { + for (int tx = -template_window_half_size_; tx <= template_window_half_size_; tx++) + { + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) + { int dist = calcDist( - main_extended_src_.at( - border_size_ + i + ty, border_size_ + j + tx), - cur_extended_src.at( - border_size_ + start_y + ty, border_size_ + start_x + tx) - ); + main_extended_src_.at(border_size_ + i + ty, border_size_ + j + tx), + cur_extended_src.at(border_size_ + start_y + ty, border_size_ + start_x + tx)); *dist_sums_ptr += dist; *col_dist_sums_ptr += dist; @@ -332,18 +322,13 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForFirstElementInRo up_col_dist_sums[j][d][y][x] = col_dist_sums[template_window_size_ - 1][d][y][x]; } - } } } template inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, - Array3d& dist_sums, - Array4d& col_dist_sums, - Array4d& up_col_dist_sums) const + int i, int j, int first_col_num, Array3d& dist_sums, + Array4d& col_dist_sums, Array4d& up_col_dist_sums) const { int ay = border_size_ + i; int ax = border_size_ + j + template_window_half_size_; @@ -353,10 +338,12 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForElementInFirstRo int new_last_col_num = first_col_num; - for (int d = 0; d < temporal_window_size_; d++) { + for (int d = 0; d < temporal_window_size_; d++) + { Mat cur_extended_src = extended_srcs_[d]; - for (int y = 0; y < search_window_size_; y++) { - for (int x = 0; x < search_window_size_; x++) { + for (int y = 0; y < search_window_size_; y++) + for (int x = 0; x < search_window_size_; x++) + { dist_sums[d][y][x] -= col_dist_sums[first_col_num][d][y][x]; col_dist_sums[new_last_col_num][d][y][x] = 0; @@ -364,19 +351,17 @@ inline void FastNlMeansMultiDenoisingInvoker::calcDistSumsForElementInFirstRo int bx = start_bx + x; int* col_dist_sums_ptr = &col_dist_sums[new_last_col_num][d][y][x]; - for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) { - *col_dist_sums_ptr += - calcDist( - main_extended_src_.at(ay + ty, ax), - cur_extended_src.at(by + ty, bx) - ); + for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) + { + *col_dist_sums_ptr += calcDist( + main_extended_src_.at(ay + ty, ax), + cur_extended_src.at(by + ty, bx)); } dist_sums[d][y][x] += col_dist_sums[new_last_col_num][d][y][x]; up_col_dist_sums[j][d][y][x] = col_dist_sums[new_last_col_num][d][y][x]; } - } } } From d27068f79ada4d1a344d2e2381ee239af2946178 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 13 Feb 2014 15:15:05 +0400 Subject: [PATCH 02/11] some more refactoring --- modules/photo/src/arrays.hpp | 84 ++++++++++++------- modules/photo/src/denoising.cpp | 25 +++--- .../src/fast_nlmeans_denoising_invoker.hpp | 53 +++++------- ...fast_nlmeans_denoising_invoker_commons.hpp | 2 +- 4 files changed, 86 insertions(+), 78 deletions(-) diff --git a/modules/photo/src/arrays.hpp b/modules/photo/src/arrays.hpp index a33018e59..4aec5f7a1 100644 --- a/modules/photo/src/arrays.hpp +++ b/modules/photo/src/arrays.hpp @@ -39,10 +39,14 @@ // //M*/ +#include "opencv2/core/base.hpp" + #ifndef __OPENCV_DENOISING_ARRAYS_HPP__ #define __OPENCV_DENOISING_ARRAYS_HPP__ -template struct Array2d { +template +struct Array2d +{ T* a; int n1,n2; bool needToDeallocArray; @@ -50,14 +54,16 @@ template struct Array2d { Array2d(const Array2d& array2d): a(array2d.a), n1(array2d.n1), n2(array2d.n2), needToDeallocArray(false) { - if (array2d.needToDeallocArray) { - // copy constructor for self allocating arrays not supported - throw new std::exception(); + if (array2d.needToDeallocArray) + { + CV_Error(Error::BadDataPtr, "Copy constructor for self allocating arrays not supported"); } } Array2d(T* _a, int _n1, int _n2): - a(_a), n1(_n1), n2(_n2), needToDeallocArray(false) {} + a(_a), n1(_n1), n2(_n2), needToDeallocArray(false) + { + } Array2d(int _n1, int _n2): n1(_n1), n2(_n2), needToDeallocArray(true) @@ -65,28 +71,34 @@ template struct Array2d { a = new T[n1*n2]; } - ~Array2d() { - if (needToDeallocArray) { + ~Array2d() + { + if (needToDeallocArray) delete[] a; - } } - T* operator [] (int i) { + T* operator [] (int i) + { return a + i*n2; } - inline T* row_ptr(int i) { + inline T* row_ptr(int i) + { return (*this)[i]; } }; -template struct Array3d { +template +struct Array3d +{ T* a; int n1,n2,n3; bool needToDeallocArray; Array3d(T* _a, int _n1, int _n2, int _n3): - a(_a), n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(false) {} + a(_a), n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(false) + { + } Array3d(int _n1, int _n2, int _n3): n1(_n1), n2(_n2), n3(_n3), needToDeallocArray(true) @@ -94,64 +106,72 @@ template struct Array3d { a = new T[n1*n2*n3]; } - ~Array3d() { - if (needToDeallocArray) { + ~Array3d() + { + if (needToDeallocArray) delete[] a; - } } - Array2d operator [] (int i) { + Array2d operator [] (int i) + { Array2d array2d(a + i*n2*n3, n2, n3); return array2d; } - inline T* row_ptr(int i1, int i2) { + inline T* row_ptr(int i1, int i2) + { return a + i1*n2*n3 + i2*n3; } }; -template struct Array4d { +template +struct Array4d +{ T* a; int n1,n2,n3,n4; bool needToDeallocArray; int steps[4]; - void init_steps() { + void init_steps() + { steps[0] = n2*n3*n4; steps[1] = n3*n4; steps[2] = n4; steps[3] = 1; } - Array4d(T* _a, int _n1, int _n2, int _n3, int _n4): + Array4d(T* _a, int _n1, int _n2, int _n3, int _n4) : a(_a), n1(_n1), n2(_n2), n3(_n3), n4(_n4), needToDeallocArray(false) - { + { init_steps(); - } + } - Array4d(int _n1, int _n2, int _n3, int _n4): + Array4d(int _n1, int _n2, int _n3, int _n4) : n1(_n1), n2(_n2), n3(_n3), n4(_n4), needToDeallocArray(true) { a = new T[n1*n2*n3*n4]; init_steps(); - } - - ~Array4d() { - if (needToDeallocArray) { - delete[] a; - } } - Array3d operator [] (int i) { + ~Array4d() + { + if (needToDeallocArray) + delete[] a; + } + + Array3d operator [] (int i) + { Array3d array3d(a + i*n2*n3*n4, n2, n3, n4); return array3d; } - inline T* row_ptr(int i1, int i2, int i3) { + inline T* row_ptr(int i1, int i2, int i3) + { return a + i1*n2*n3*n4 + i2*n3*n4 + i3*n4; } - inline int step_size(int dimension) { + inline int step_size(int dimension) + { return steps[dimension]; } }; diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp index 166d7f892..c9a711b32 100644 --- a/modules/photo/src/denoising.cpp +++ b/modules/photo/src/denoising.cpp @@ -117,7 +117,8 @@ static void fastNlMeansDenoisingMultiCheckPreconditions( int templateWindowSize, int searchWindowSize) { int src_imgs_size = static_cast(srcImgs.size()); - if (src_imgs_size == 0) { + if (src_imgs_size == 0) + { CV_Error(Error::StsBadArg, "Input images vector should not be empty!"); } @@ -136,11 +137,11 @@ static void fastNlMeansDenoisingMultiCheckPreconditions( "should be choosen corresponding srcImgs size!"); } - for (int i = 1; i < src_imgs_size; i++) { - if (srcImgs[0].size() != srcImgs[i].size() || srcImgs[0].type() != srcImgs[i].type()) { + for (int i = 1; i < src_imgs_size; i++) + if (srcImgs[0].size() != srcImgs[i].size() || srcImgs[0].type() != srcImgs[i].type()) + { CV_Error(Error::StsBadArg, "Input images should have the same size and type!"); } - } } void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _dst, @@ -152,12 +153,13 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds fastNlMeansDenoisingMultiCheckPreconditions( srcImgs, imgToDenoiseIndex, - temporalWindowSize, templateWindowSize, searchWindowSize - ); + temporalWindowSize, templateWindowSize, searchWindowSize); + _dst.create(srcImgs[0].size(), srcImgs[0].type()); Mat dst = _dst.getMat(); - switch (srcImgs[0].type()) { + switch (srcImgs[0].type()) + { case CV_8U: parallel_for_(cv::Range(0, srcImgs[0].rows), FastNlMeansMultiDenoisingInvoker( @@ -192,15 +194,15 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr fastNlMeansDenoisingMultiCheckPreconditions( srcImgs, imgToDenoiseIndex, - temporalWindowSize, templateWindowSize, searchWindowSize - ); + temporalWindowSize, templateWindowSize, searchWindowSize); _dst.create(srcImgs[0].size(), srcImgs[0].type()); Mat dst = _dst.getMat(); int src_imgs_size = static_cast(srcImgs.size()); - if (srcImgs[0].type() != CV_8UC3) { + if (srcImgs[0].type() != CV_8UC3) + { CV_Error(Error::StsBadArg, "Type of input images should be CV_8UC3!"); return; } @@ -211,7 +213,8 @@ void cv::fastNlMeansDenoisingColoredMulti( InputArrayOfArrays _srcImgs, OutputAr std::vector src_lab(src_imgs_size); std::vector l(src_imgs_size); std::vector ab(src_imgs_size); - for (int i = 0; i < src_imgs_size; i++) { + for (int i = 0; i < src_imgs_size; i++) + { src_lab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC3); l[i] = Mat::zeros(srcImgs[0].size(), CV_8UC1); ab[i] = Mat::zeros(srcImgs[0].size(), CV_8UC2); diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp index b274c6889..b8f5a0392 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp @@ -101,7 +101,7 @@ inline int getNearestPowerOf2(int value) template FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( - const cv::Mat& src, cv::Mat& dst, + const Mat& src, Mat& dst, int template_window_size, int search_window_size, const float h) : @@ -115,22 +115,20 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( search_window_size_ = search_window_half_size_ * 2 + 1; border_size_ = search_window_half_size_ + template_window_half_size_; - copyMakeBorder(src_, extended_src_, - border_size_, border_size_, border_size_, border_size_, cv::BORDER_DEFAULT); + copyMakeBorder(src_, extended_src_, border_size_, border_size_, border_size_, border_size_, BORDER_DEFAULT); const int max_estimate_sum_value = search_window_size_ * search_window_size_ * 255; fixed_point_mult_ = std::numeric_limits::max() / max_estimate_sum_value; // precalc weight for every possible l2 dist between blocks // additional optimization of precalced weights to replace division(averaging) by binary shift - - CV_Assert(template_window_size_ <= 46340 ); // sqrt(INT_MAX) + CV_Assert(template_window_size_ <= 46340); // sqrt(INT_MAX) int template_window_size_sq = template_window_size_ * template_window_size_; almost_template_window_size_sq_bin_shift_ = getNearestPowerOf2(template_window_size_sq); double almost_dist2actual_dist_multiplier = ((double)(1 << almost_template_window_size_sq_bin_shift_)) / template_window_size_sq; int max_dist = 255 * 255 * sizeof(T); - int almost_max_dist = (int) (max_dist / almost_dist2actual_dist_multiplier + 1); + int almost_max_dist = (int)(max_dist / almost_dist2actual_dist_multiplier + 1); almost_dist2weight_.resize(almost_max_dist); const double WEIGHT_THRESHOLD = 0.001; @@ -157,12 +155,14 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const int row_from = range.start; int row_to = range.end - 1; + // sums of cols anf rows for current pixel p Array2d dist_sums(search_window_size_, search_window_size_); - // for lazy calc optimization + // for lazy calc optimization (sum of cols for current pixel) Array3d col_dist_sums(template_window_size_, search_window_size_, search_window_size_); int first_col_num = -1; + // last elements of column sum (for each element in row) Array3d up_col_dist_sums(src_.cols, search_window_size_, search_window_size_); for (int i = row_from; i <= row_to; i++) @@ -177,7 +177,6 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { calcDistSumsForFirstElementInRow(i, dist_sums, col_dist_sums, up_col_dist_sums); first_col_num = 0; - } else { @@ -186,7 +185,6 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const { calcDistSumsForElementInFirstRow(i, j, first_col_num, dist_sums, col_dist_sums, up_col_dist_sums); - } else { @@ -204,29 +202,23 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const for (int y = 0; y < search_window_size; y++) { - int* dist_sums_row = dist_sums.row_ptr(y); + int * dist_sums_row = dist_sums.row_ptr(y); + int * col_dist_sums_row = col_dist_sums.row_ptr(first_col_num, y); + int * up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y); - int* col_dist_sums_row = col_dist_sums.row_ptr(first_col_num,y); - int* up_col_dist_sums_row = up_col_dist_sums.row_ptr(j, y); - - const T* b_up_ptr = extended_src_.ptr(start_by - template_window_half_size_ - 1 + y); - const T* b_down_ptr = extended_src_.ptr(start_by + template_window_half_size_ + y); + const T * b_up_ptr = extended_src_.ptr(start_by - template_window_half_size_ - 1 + y); + const T * b_down_ptr = extended_src_.ptr(start_by + template_window_half_size_ + y); for (int x = 0; x < search_window_size; x++) { + // remove from current pixel sum column sum with index "first_col_num" dist_sums_row[x] -= col_dist_sums_row[x]; - col_dist_sums_row[x] = - up_col_dist_sums_row[x] + - calcUpDownDist( - a_up, a_down, - b_up_ptr[start_bx + x], b_down_ptr[start_bx + x] - ); + int bx = start_bx + x; + col_dist_sums_row[x] = up_col_dist_sums_row[x] + calcUpDownDist(a_up, a_down, b_up_ptr[bx], b_down_ptr[bx]); dist_sums_row[x] += col_dist_sums_row[x]; - up_col_dist_sums_row[x] = col_dist_sums_row[x]; - } } } @@ -235,9 +227,7 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const } // calc weights - int weights_sum = 0; - - int estimation[3]; + int estimation[3], weights_sum = 0; for (size_t channel_num = 0; channel_num < sizeof(T); channel_num++) estimation[channel_num] = 0; @@ -247,9 +237,7 @@ void FastNlMeansDenoisingInvoker::operator() (const Range& range) const int* dist_sums_row = dist_sums.row_ptr(y); for (int x = 0; x < search_window_size_; x++) { - int almostAvgDist = - dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_; - + int almostAvgDist = dist_sums_row[x] >> almost_template_window_size_sq_bin_shift_; int weight = almost_dist2weight_[almostAvgDist]; weights_sum += weight; @@ -302,9 +290,7 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForFirstElementInRow( template inline void FastNlMeansDenoisingInvoker::calcDistSumsForElementInFirstRow( - int i, - int j, - int first_col_num, + int i, int j, int first_col_num, Array2d& dist_sums, Array3d& col_dist_sums, Array3d& up_col_dist_sums) const @@ -326,8 +312,7 @@ inline void FastNlMeansDenoisingInvoker::calcDistSumsForElementInFirstRow( int by = start_by + y; int bx = start_bx + x; for (int ty = -template_window_half_size_; ty <= template_window_half_size_; ty++) - col_dist_sums[new_last_col_num][y][x] += - calcDist(extended_src_, ay + ty, ax, by + ty, bx); + col_dist_sums[new_last_col_num][y][x] += calcDist(extended_src_, ay + ty, ax, by + ty, bx); dist_sums[y][x] += col_dist_sums[new_last_col_num][y][x]; up_col_dist_sums[j][y][x] = col_dist_sums[new_last_col_num][y][x]; diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp index 3e8f4c498..7a2650077 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp @@ -70,7 +70,7 @@ template static inline int calcDist(const Mat& m, int i1, int j1, i template static inline int calcUpDownDist(T a_up, T a_down, T b_up, T b_down) { - return calcDist(a_down,b_down) - calcDist(a_up, b_up); + return calcDist(a_down, b_down) - calcDist(a_up, b_up); } template <> inline int calcUpDownDist(uchar a_up, uchar a_down, uchar b_up, uchar b_down) From 891dbeabbabf586be357c4897d0dbcca81185d0a Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Sun, 16 Feb 2014 21:54:51 +0400 Subject: [PATCH 03/11] implemented OpenCL version of cv::fastNlMeansDenoising --- modules/photo/src/denoising.cpp | 7 +- .../src/fast_nlmeans_denoising_opencl.hpp | 128 +++++++++ modules/photo/src/opencl/nlmeans.cl | 249 ++++++++++++++++++ modules/photo/src/precomp.hpp | 2 + modules/photo/test/ocl/test_denoising.cpp | 105 ++++++++ 5 files changed, 489 insertions(+), 2 deletions(-) create mode 100644 modules/photo/src/fast_nlmeans_denoising_opencl.hpp create mode 100644 modules/photo/src/opencl/nlmeans.cl create mode 100644 modules/photo/test/ocl/test_denoising.cpp diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp index c9a711b32..a3aa0f41f 100644 --- a/modules/photo/src/denoising.cpp +++ b/modules/photo/src/denoising.cpp @@ -40,14 +40,17 @@ //M*/ #include "precomp.hpp" -#include "opencv2/photo.hpp" -#include "opencv2/imgproc.hpp" + #include "fast_nlmeans_denoising_invoker.hpp" #include "fast_nlmeans_multi_denoising_invoker.hpp" +#include "fast_nlmeans_denoising_opencl.hpp" void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h, int templateWindowSize, int searchWindowSize) { + CV_OCL_RUN(_src.dims() <= 2 && (_src.isUMat() || _dst.isUMat()), + ocl_fastNlMeansDenoising(_src, _dst, h, templateWindowSize, searchWindowSize)) + Mat src = _src.getMat(); _dst.create(src.size(), src.type()); Mat dst = _dst.getMat(); diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp new file mode 100644 index 000000000..404a59ee0 --- /dev/null +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -0,0 +1,128 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__ +#define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__ + +#include "precomp.hpp" + +#define CV_OPENCL_RUN_ASSERT +#include "opencl_kernels.hpp" + +namespace cv { + +enum +{ + BLOCK_ROWS = 32, + BLOCK_COLS = 128, + CTA_SIZE = 128 +}; + +static inline int getNearestPowerOf2(int value) +{ + int p = 0; + while (1 << p < value) + ++p; + return p; +} + +static int divUp(int a, int b) +{ + return (a + b - 1) / b; +} + +static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, float h, int cn, + int & almostTemplateWindowSizeSqBinShift) +{ + const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255; + int fixedPointMult = std::numeric_limits::max() / maxEstimateSumValue; + + // precalc weight for every possible l2 dist between blocks + // additional optimization of precalced weights to replace division(averaging) by binary shift + CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX) + int templateWindowSizeSq = templateWindowSize * templateWindowSize; + almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq); + float almostDist2ActualDistMultiplier = (float)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; + + const float WEIGHT_THRESHOLD = 1e-3f; + int maxDist = 255 * 255 * cn; + int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); + float den = 1.0f / (h * h * cn); + + almostDist2Weight.create(1, almostMaxDist, CV_32SC1); + + ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc, + "-D OP_CALC_WEIGHTS"); + if (k.empty()) + return false; + + k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist, + almostDist2ActualDistMultiplier, fixedPointMult, den, WEIGHT_THRESHOLD); + + size_t globalsize[1] = { almostMaxDist }; + return k.run(1, globalsize, NULL, false); +} + +static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, + int templateWindowSize, int searchWindowSize) +{ + int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + Size size = _src.size(); + + if ( !(depth == CV_8U && cn <= 4 && cn != 3) ) + return false; + + int templateWindowHalfWize = templateWindowSize / 2; + int searchWindowHalfSize = searchWindowSize / 2; + templateWindowSize = templateWindowHalfWize * 2 + 1; + searchWindowSize = searchWindowHalfSize * 2 + 1; + int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS); + int almostTemplateWindowSizeSqBinShift = -1; + + char cvt[2][40]; + String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d" + " -D uchar_t=%s -D int_t=%s -D BLOCK_COLS=%d -D BLOCK_ROWS=%d" + " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d" + " -D convert_int_t=%s -D cn=%d -D CTA_SIZE2=%d -D convert_uchar_t=%s", + templateWindowSize, searchWindowSize, ocl::typeToStr(type), + ocl::typeToStr(CV_32SC(cn)), BLOCK_COLS, BLOCK_ROWS, CTA_SIZE, + templateWindowHalfWize, searchWindowHalfSize, + ocl::convertTypeStr(CV_8U, CV_32S, cn, cvt[0]), cn, + CTA_SIZE >> 1, ocl::convertTypeStr(CV_32S, CV_8U, cn, cvt[1])); + + ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts); + if (k.empty()) + return false; + + UMat almostDist2Weight; + if (!ocl_calcAlmostDist2Weight(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn, + almostTemplateWindowSizeSqBinShift)) + return false; + CV_Assert(almostTemplateWindowSizeSqBinShift >= 0); + + UMat srcex; + int borderSize = searchWindowHalfSize + templateWindowHalfWize; + copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT); + + _dst.create(size, type); + UMat dst = _dst.getUMat(); + + Size upColSumSize(size.width, searchWindowSize * searchWindowSize * nblocksy); + Size colSumSize(nblocksx * templateWindowSize, searchWindowSize * searchWindowSize * nblocksy); + UMat buffer(upColSumSize + colSumSize, CV_32SC(cn)); + + k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst), + ocl::KernelArg::PtrReadOnly(almostDist2Weight), nblocksy, nblocksx, + ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); + + size_t globalsize[2] = { nblocksx, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; + return k.run(2, globalsize, localsize, false); +} + +} + +#endif diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl new file mode 100644 index 000000000..1b40517ab --- /dev/null +++ b/modules/photo/src/opencl/nlmeans.cl @@ -0,0 +1,249 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. + +#ifdef OP_CALC_WEIGHTS + +__kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, + float almostDist2ActualDistMultiplier, int fixedPointMult, + float den, float WEIGHT_THRESHOLD) +{ + int almostDist = get_global_id(0); + + if (almostDist < almostMaxDist) + { + float dist = almostDist * almostDist2ActualDistMultiplier; + int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); + + if (weight < WEIGHT_THRESHOLD * fixedPointMult) + weight = 0; + + almostDist2Weight[almostDist] = weight; + } +} + +#elif defined OP_CALC_FASTNLMEANS + +#define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE) + +inline int_t calcDist(uchar_t a, uchar_t b) +{ + int_t diff = convert_int_t(a) -convert_int_t(b); + return diff * diff; +} + +inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset, + __local int_t * dists, int y, int x, int id, + __global int_t * col_dists, __global int_t * up_col_dists) +{ + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + + for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + { + int_t dist = (int_t)(0), value; + + sx += i % SEARCH_SIZE; + sy += i / SEARCH_SIZE; + + __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset))); + __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); + __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; + + #pragma unroll + for (int j = 0; j < TEMPLATE_SIZE; ++j) + col_dists_current[j] = (int_t)(0); + + #pragma unroll + for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) + { + #pragma unroll + for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx) + { + value = calcDist(src_template[tx], src_current[tx]); + + col_dists_current[tx + TEMPLATE_SIZE2] += value; + dist += value; + } + + src_current += src_step; + src_template += src_step; + } + + dists[i] = dist; + up_col_dists[i] = col_dists[TEMPLATE_SIZE - 1]; + } +} + +inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, + __local int_t * dists, int y, int x, int id, int first, + __global int_t * col_dists, __global int_t * up_col_dists) +{ + x += TEMPLATE_SIZE2; + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + + for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + { + sx += i % SEARCH_SIZE; + sy += i / SEARCH_SIZE; + + __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); + __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset))); + __global int_t * col_dists_current = col_dists + TEMPLATE_SIZE * i; + + int_t value; + dists[id] -= col_dists_current[first]; + col_dists_current[first] = (int_t)(0); + + #pragma unroll + for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) + { + value = calcDist(src_current[0], src_template[0]); + col_dists_current[first] += value; + + src_current += src_step; + src_template += src_step; + } + + dists[id] += col_dists_current[first]; + up_col_dists[id] = col_dists_current[first]; + } +} + +inline void calcElement(__global const uchar * src, int src_step, int src_offset, + __local int_t * dists, int y, int x, int id, int first, + __global int_t * col_dists, __global int_t * up_col_dists) +{ + int sx_up = x + TEMPLATE_SIZE2, sy_up = y - TEMPLATE_SIZE2 - 1; + int sx_down = x + TEMPLATE_SIZE2, sy_down = y + TEMPLATE_SIZE2; + + uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); + uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); + + for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + { + int wx = i % SEARCH_SIZE; + int wy = i / SEARCH_SIZE; + + sx_up += wx, sx_down += wx; + sy_up += wy, sy_down += wy; + + uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); + uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); + + __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; + __global int_t * up_col_dists_current = up_col_dists + i; + + dists[i] -= col_dists_current[first]; + col_dists_current[first] = up_col_dists_current[id] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); + dists[i] += col_dists_current[first]; + up_col_dists_current[id] = col_dists_current[first]; + } +} + +inline void convolveWindow(__global const uchar * src, int src_step, int src_offset, + __local int * dists, __global const int * almostDist2Weight, + __global uchar * dst, int dst_step, int dst_offset, + int y, int x, int id, __local int * weights_local, + __local int * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) +{ + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0; + int_t weighted_sum = (int_t)(0); + + for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += id) + { + int src_index = mad24(sy + i / SEARCH_SIZE, src_step, (i % SEARCH_SIZE + sx) * cn + src_offset); + __global const uchar_t * src_search = (__global const uchar_t *)(src + src_index); + + int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; + int weight = almostDist2Weight[almostAvgDist]; + + weights += weight; + weighted_sum += (int_t)(weight) * convert_int_t(src_search[0]); + } + + if (id >= CTA_SIZE2) + { + weights_local[id - CTA_SIZE2] = weights; + weighted_sum_local[id - CTA_SIZE2] = weighted_sum; + } + barrier(CLK_LOCAL_MEM_FENCE); + + if (id < CTA_SIZE2) + { + weights_local[id] += weights; + weighted_sum_local[id] += weighted_sum; + } + barrier(CLK_LOCAL_MEM_FENCE); + + for (int lsize = CTA_SIZE2 >> 1; lsize >= 4; lsize >>= 1) + { + if (id < lsize) + { + int id2 = lsize + id; + weights_local[id] = weights + weights_local[id2]; + weighted_sum_local[id] = weighted_sum + weighted_sum_local[id2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (id == 0) + { + int dst_index = mad24(y, dst_step, dst_offset + x * cn); + + int_t weights_local_0 = (int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]); + int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3]; + + *(__global uchar_t *)(dst + dst_index) = convert_uchar_t((weighted_sum_local_0 + weights_local_0 >> 1) / weights_local_0); + } +} + +__kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, + __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, + __global const int * almostDist2Weight, int nblocksy, int nblocksx, + __global uchar * buffer, int almostTemplateWindowSizeSqBinShift) +{ + int block_x = get_global_id(0); + int block_y = get_global_id(1); + int id = get_local_id(0), first; + + __local int_t dists[SEARCH_SIZE_SQ], weighted_sum[CTA_SIZE2]; + __local int weights[CTA_SIZE2]; + + int block_data_start = mad24(block_y, nblocksx, block_x) * SEARCH_SIZE_SQ * (TEMPLATE_SIZE + BLOCK_COLS); + __global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t)); + __global int_t * up_col_dists = (__global int_t *)(buffer + sizeof(int_t) * (block_data_start + SEARCH_SIZE_SQ * TEMPLATE_SIZE)); + + if (block_x < nblocksx && block_y < nblocksy) + { + int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); + int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); + + for (int y = y0; y < y1; ++y) + for (int x = x0; x < x1; ++x) + { + if (x == x0) + { + calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); + first = 0; + } + else + { + if (y == y0) + calcElementInFirstRow(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists); + else + { + calcElement(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists); + first = (first + 1) % TEMPLATE_SIZE; + } + + convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, + y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift); + } + } + } +} + +#endif diff --git a/modules/photo/src/precomp.hpp b/modules/photo/src/precomp.hpp index 38ac3ffcd..4355f13c0 100644 --- a/modules/photo/src/precomp.hpp +++ b/modules/photo/src/precomp.hpp @@ -46,6 +46,8 @@ #include "opencv2/core/private.hpp" #include "opencv2/core/utility.hpp" #include "opencv2/photo.hpp" +#include "opencv2/core/ocl.hpp" +#include "opencv2/imgproc.hpp" #ifdef HAVE_TEGRA_OPTIMIZATION #include "opencv2/photo/photo_tegra.hpp" diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp new file mode 100644 index 000000000..0554b3067 --- /dev/null +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -0,0 +1,105 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "test_precomp.hpp" +#include "opencv2/ts/ocl_test.hpp" + +#ifdef HAVE_OPENCL + +namespace cvtest { +namespace ocl { + +PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool) +{ + int cn, templateWindowSize, searchWindowSize; + float h; + bool use_roi; + + TEST_DECLARE_INPUT_PARAMETER(src) + TEST_DECLARE_OUTPUT_PARAMETER(dst) + + virtual void SetUp() + { + cn = GET_PARAM(0); + use_roi = GET_PARAM(1); + + templateWindowSize = 7; + searchWindowSize = 21; + h = 3.0f; + } + + virtual void generateTestData() + { + const int type = CV_8UC(cn); + + Size roiSize = randomSize(1, MAX_VALUE); + Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255); + + Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); + randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255); + + UMAT_UPLOAD_INPUT_PARAMETER(src) + UMAT_UPLOAD_OUTPUT_PARAMETER(dst) + } +}; + +typedef FastNlMeansDenoisingTestBase FastNlMeansDenoising; + +OCL_TEST_P(FastNlMeansDenoising, Mat) +{ + for (int j = 0; j < test_loop_times; j++) + { + generateTestData(); + + OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize)); + OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize)); + + OCL_EXPECT_MATS_NEAR(dst, 1) + } +} + +OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values((Channels)1), Bool())); + +} } // namespace cvtest::ocl + +#endif // HAVE_OPENCL From 9db28f332acf53eb2001249af33919b4ac742377 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 17 Feb 2014 19:27:34 +0400 Subject: [PATCH 04/11] more fixes --- .../src/fast_nlmeans_denoising_invoker.hpp | 2 + .../src/fast_nlmeans_denoising_opencl.hpp | 10 +- modules/photo/src/opencl/nlmeans.cl | 179 ++++++++++-------- 3 files changed, 104 insertions(+), 87 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp index b8f5a0392..2ea09719e 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp @@ -137,6 +137,8 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( double dist = almost_dist * almost_dist2actual_dist_multiplier; int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); + printf("%d ", weight); + if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) weight = 0; diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index 404a59ee0..cd28c1489 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -111,15 +111,17 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, _dst.create(size, type); UMat dst = _dst.getUMat(); - Size upColSumSize(size.width, searchWindowSize * searchWindowSize * nblocksy); - Size colSumSize(nblocksx * templateWindowSize, searchWindowSize * searchWindowSize * nblocksy); + int searchWindowSizeSq = searchWindowSize * searchWindowSize; + Size upColSumSize(size.width, searchWindowSizeSq * nblocksy); + Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy); UMat buffer(upColSumSize + colSumSize, CV_32SC(cn)); + srcex = srcex(Rect(Point(borderSize, borderSize), size)); k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst), - ocl::KernelArg::PtrReadOnly(almostDist2Weight), nblocksy, nblocksx, + ocl::KernelArg::PtrReadOnly(almostDist2Weight), ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); - size_t globalsize[2] = { nblocksx, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; + size_t globalsize[2] = { nblocksx * BLOCK_COLS, nblocksy * BLOCK_ROWS }, localsize[2] = { CTA_SIZE, 1 }; return k.run(2, globalsize, localsize, false); } diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index 1b40517ab..807e89a62 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -5,6 +5,8 @@ // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. +#pragma OPENCL_EXTENSION cl_amd_printf:enable + #ifdef OP_CALC_WEIGHTS __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, @@ -18,6 +20,8 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost float dist = almostDist * almostDist2ActualDistMultiplier; int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); +// printf("%d ", weight); + if (weight < WEIGHT_THRESHOLD * fixedPointMult) weight = 0; @@ -31,7 +35,7 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost inline int_t calcDist(uchar_t a, uchar_t b) { - int_t diff = convert_int_t(a) -convert_int_t(b); + int_t diff = convert_int_t(a) - convert_int_t(b); return diff * diff; } @@ -39,16 +43,14 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int __local int_t * dists, int y, int x, int id, __global int_t * col_dists, __global int_t * up_col_dists) { - int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2 - TEMPLATE_SIZE2; - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { int_t dist = (int_t)(0), value; - sx += i % SEARCH_SIZE; - sy += i / SEARCH_SIZE; - - __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset))); + __global const uchar_t * src_template = (__global const uchar_t *)(src + + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; @@ -57,7 +59,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int col_dists_current[j] = (int_t)(0); #pragma unroll - for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) + for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) { #pragma unroll for (int tx = -TEMPLATE_SIZE2; tx <= TEMPLATE_SIZE2; ++tx) @@ -68,78 +70,86 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int dist += value; } - src_current += src_step; - src_template += src_step; + src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); + src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); } dists[i] = dist; - up_col_dists[i] = col_dists[TEMPLATE_SIZE - 1]; + up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1]; } } +#define COND if (i == 252 && x0 == 20) + inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, - __local int_t * dists, int y, int x, int id, int first, + __local int_t * dists, int y, int x0, int x, int id, int first, __global int_t * col_dists, __global int_t * up_col_dists) { x += TEMPLATE_SIZE2; + y -= TEMPLATE_SIZE2; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { - sx += i % SEARCH_SIZE; - sy += i / SEARCH_SIZE; - __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); - __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy, src_step, mad24(cn, x, src_offset))); + __global const uchar_t * src_template = (__global const uchar_t *)(src + + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); __global int_t * col_dists_current = col_dists + TEMPLATE_SIZE * i; - int_t value; - dists[id] -= col_dists_current[first]; - col_dists_current[first] = (int_t)(0); + int_t col_dist = (int_t)(0); #pragma unroll - for (int ty = -TEMPLATE_SIZE2; ty <= TEMPLATE_SIZE2; ++ty) + for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) { - value = calcDist(src_current[0], src_template[0]); - col_dists_current[first] += value; + col_dist += calcDist(src_current[0], src_template[0]); +// COND printf("%d\n", calcDist(src_current[0], src_template[0])); - src_current += src_step; - src_template += src_step; + src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); + src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); } - dists[id] += col_dists_current[first]; - up_col_dists[id] = col_dists_current[first]; + dists[i] += col_dist - col_dists_current[first]; + col_dists_current[first] = col_dist; + up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist; +// COND printf("res = %d\n", col_dist); } } inline void calcElement(__global const uchar * src, int src_step, int src_offset, - __local int_t * dists, int y, int x, int id, int first, + __local int_t * dists, int y, int x0, int x, int id, int first, __global int_t * col_dists, __global int_t * up_col_dists) { - int sx_up = x + TEMPLATE_SIZE2, sy_up = y - TEMPLATE_SIZE2 - 1; - int sx_down = x + TEMPLATE_SIZE2, sy_down = y + TEMPLATE_SIZE2; + int sx = x + TEMPLATE_SIZE2; + int sy_up = y - TEMPLATE_SIZE2 - 1 /*- TEMPLATE_SIZE*/; + int sy_down = y + TEMPLATE_SIZE2; - uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); - uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); + uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset))); + uchar_t down_value = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx, src_offset))); - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) + sx -= SEARCH_SIZE2; + sy_up -= SEARCH_SIZE2; + sy_down -= SEARCH_SIZE2; + + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { - int wx = i % SEARCH_SIZE; - int wy = i / SEARCH_SIZE; + int wx = i % SEARCH_SIZE, wy = i / SEARCH_SIZE; - sx_up += wx, sx_down += wx; - sy_up += wy, sy_down += wy; + uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset))); + uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset))); - uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx_up, src_offset))); - uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down, src_step, mad24(cn, sx_down, src_offset))); + __global int_t * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); + __global int_t * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); - __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; - __global int_t * up_col_dists_current = up_col_dists + i; +// COND printf("\nres = %d\n", up_col_dists_current[0]); +// COND printf("up = %d, down = %d\n", calcDist(up_value, up_value_t), calcDist(down_value, down_value_t)); + int_t col_dist = up_col_dists_current[0] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); - dists[i] -= col_dists_current[first]; - col_dists_current[first] = up_col_dists_current[id] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); - dists[i] += col_dists_current[first]; - up_col_dists_current[id] = col_dists_current[first]; + dists[i] += col_dist - col_dists_current[0]; + col_dists_current[0] = col_dist; + up_col_dists_current[0] = col_dist; + +// COND printf("res = %d\n", up_col_dists_current[0]); +// if (up_col_dists_current[0] < 0) printf("%d %d -- %d\n", i, x0, up_col_dists_current[0]); } } @@ -147,27 +157,28 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off __local int * dists, __global const int * almostDist2Weight, __global uchar * dst, int dst_step, int dst_offset, int y, int x, int id, __local int * weights_local, - __local int * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) + __local int_t * weighted_sum_local, int almostTemplateWindowSizeSqBinShift) { int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2, weights = 0; int_t weighted_sum = (int_t)(0); - for (int i = 0, size = SEARCH_SIZE_SQ; i < size; i += id) + for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { - int src_index = mad24(sy + i / SEARCH_SIZE, src_step, (i % SEARCH_SIZE + sx) * cn + src_offset); - __global const uchar_t * src_search = (__global const uchar_t *)(src + src_index); + int src_index = mad24(sy + i / SEARCH_SIZE, src_step, mad24(i % SEARCH_SIZE + sx, cn, src_offset)); + int_t src_value = convert_int_t(*(__global const uchar_t *)(src + src_index)); int almostAvgDist = dists[i] >> almostTemplateWindowSizeSqBinShift; int weight = almostDist2Weight[almostAvgDist]; weights += weight; - weighted_sum += (int_t)(weight) * convert_int_t(src_search[0]); + weighted_sum += (int_t)(weight) * src_value; } if (id >= CTA_SIZE2) { - weights_local[id - CTA_SIZE2] = weights; - weighted_sum_local[id - CTA_SIZE2] = weighted_sum; + int id2 = id - CTA_SIZE2; + weights_local[id2] = weights; + weighted_sum_local[id2] = weighted_sum; } barrier(CLK_LOCAL_MEM_FENCE); @@ -191,9 +202,9 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off if (id == 0) { - int dst_index = mad24(y, dst_step, dst_offset + x * cn); + int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset)); - int_t weights_local_0 = (int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]); + int_t weights_local_0 = (int_t)(1);//(int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]); int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3]; *(__global uchar_t *)(dst + dst_index) = convert_uchar_t((weighted_sum_local_0 + weights_local_0 >> 1) / weights_local_0); @@ -202,48 +213,50 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int src_offset, __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __global const int * almostDist2Weight, int nblocksy, int nblocksx, - __global uchar * buffer, int almostTemplateWindowSizeSqBinShift) + __global const int * almostDist2Weight, __global uchar * buffer, + int almostTemplateWindowSizeSqBinShift) { - int block_x = get_global_id(0); - int block_y = get_global_id(1); + int block_x = get_group_id(0), nblocks_x = get_num_groups(0); + int block_y = get_group_id(1); int id = get_local_id(0), first; __local int_t dists[SEARCH_SIZE_SQ], weighted_sum[CTA_SIZE2]; __local int weights[CTA_SIZE2]; - int block_data_start = mad24(block_y, nblocksx, block_x) * SEARCH_SIZE_SQ * (TEMPLATE_SIZE + BLOCK_COLS); + int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); + int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); + + // for each group we need SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer for storing part column sum for current element + // and SEARCH_SIZE_SQ * BLOCK_COLS integer buffer for storing last column sum for each element of search window of up row + int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE); __global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t)); - __global int_t * up_col_dists = (__global int_t *)(buffer + sizeof(int_t) * (block_data_start + SEARCH_SIZE_SQ * TEMPLATE_SIZE)); + __global int_t * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE; - if (block_x < nblocksx && block_y < nblocksy) - { - int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); - int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); - - for (int y = y0; y < y1; ++y) - for (int x = x0; x < x1; ++x) + for (int y = y0; y < y1; ++y) + for (int x = x0; x < x1; ++x) + { + barrier(CLK_LOCAL_MEM_FENCE); + if (x == x0) { - if (x == x0) - { - calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); - first = 0; - } + calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); + first = 0; + } + else + { + if (y == y0) + calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); else { - if (y == y0) - calcElementInFirstRow(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists); - else - { - calcElement(src, src_step, src_offset, dists, y, x, id, first, col_dists, up_col_dists); - first = (first + 1) % TEMPLATE_SIZE; - } - - convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, - y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift); + calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); + first = (first + 1) % TEMPLATE_SIZE; } } - } + + barrier(CLK_LOCAL_MEM_FENCE); + + convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, + y, x, id, weights, weighted_sum, almostTemplateWindowSizeSqBinShift); + } } #endif From 38cbe9873a8076116ec450f204ad93ef05293352 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 18 Feb 2014 00:06:32 +0400 Subject: [PATCH 05/11] small fixes --- .../src/fast_nlmeans_denoising_invoker.hpp | 2 -- modules/photo/src/opencl/nlmeans.cl | 36 ++++++------------- modules/photo/test/ocl/test_denoising.cpp | 6 +++- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp index 2ea09719e..b8f5a0392 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp @@ -137,8 +137,6 @@ FastNlMeansDenoisingInvoker::FastNlMeansDenoisingInvoker( double dist = almost_dist * almost_dist2actual_dist_multiplier; int weight = cvRound(fixed_point_mult_ * std::exp(-dist / (h * h * sizeof(T)))); - printf("%d ", weight); - if (weight < WEIGHT_THRESHOLD * fixed_point_mult_) weight = 0; diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index 807e89a62..f038e6ce1 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -20,8 +20,6 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost float dist = almostDist * almostDist2ActualDistMultiplier; int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); -// printf("%d ", weight); - if (weight < WEIGHT_THRESHOLD * fixedPointMult) weight = 0; @@ -43,7 +41,8 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int __local int_t * dists, int y, int x, int id, __global int_t * col_dists, __global int_t * up_col_dists) { - int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2 - TEMPLATE_SIZE2; + y -= TEMPLATE_SIZE2; + int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { @@ -79,8 +78,6 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int } } -#define COND if (i == 252 && x0 == 20) - inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, __local int_t * dists, int y, int x0, int x, int id, int first, __global int_t * col_dists, __global int_t * up_col_dists) @@ -102,7 +99,6 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) { col_dist += calcDist(src_current[0], src_template[0]); -// COND printf("%d\n", calcDist(src_current[0], src_template[0])); src_current = (__global const uchar_t *)((__global const uchar *)src_current + src_step); src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); @@ -111,7 +107,6 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int dists[i] += col_dist - col_dists_current[first]; col_dists_current[first] = col_dist; up_col_dists[mad24(x0, SEARCH_SIZE_SQ, i)] = col_dist; -// COND printf("res = %d\n", col_dist); } } @@ -120,7 +115,7 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset __global int_t * col_dists, __global int_t * up_col_dists) { int sx = x + TEMPLATE_SIZE2; - int sy_up = y - TEMPLATE_SIZE2 - 1 /*- TEMPLATE_SIZE*/; + int sy_up = y - TEMPLATE_SIZE2 - 1; int sy_down = y + TEMPLATE_SIZE2; uchar_t up_value = *(__global const uchar_t *)(src + mad24(sy_up, src_step, mad24(cn, sx, src_offset))); @@ -140,16 +135,11 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset __global int_t * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); __global int_t * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); -// COND printf("\nres = %d\n", up_col_dists_current[0]); -// COND printf("up = %d, down = %d\n", calcDist(up_value, up_value_t), calcDist(down_value, down_value_t)); int_t col_dist = up_col_dists_current[0] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); dists[i] += col_dist - col_dists_current[0]; col_dists_current[0] = col_dist; up_col_dists_current[0] = col_dist; - -// COND printf("res = %d\n", up_col_dists_current[0]); -// if (up_col_dists_current[0] < 0) printf("%d %d -- %d\n", i, x0, up_col_dists_current[0]); } } @@ -189,13 +179,13 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off } barrier(CLK_LOCAL_MEM_FENCE); - for (int lsize = CTA_SIZE2 >> 1; lsize >= 4; lsize >>= 1) + for (int lsize = CTA_SIZE2 >> 1; lsize > 0; lsize >>= 1) { if (id < lsize) { int id2 = lsize + id; - weights_local[id] = weights + weights_local[id2]; - weighted_sum_local[id] = weighted_sum + weighted_sum_local[id2]; + weights_local[id] += weights_local[id2]; + weighted_sum_local[id] += weighted_sum_local[id2]; } barrier(CLK_LOCAL_MEM_FENCE); } @@ -203,11 +193,7 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off if (id == 0) { int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset)); - - int_t weights_local_0 = (int_t)(1);//(int_t)(weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]); - int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + weighted_sum_local[2] + weighted_sum_local[3]; - - *(__global uchar_t *)(dst + dst_index) = convert_uchar_t((weighted_sum_local_0 + weights_local_0 >> 1) / weights_local_0); + *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local[0]); } } @@ -235,7 +221,7 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int for (int y = y0; y < y1; ++y) for (int x = x0; x < x1; ++x) { - barrier(CLK_LOCAL_MEM_FENCE); + // barrier(CLK_LOCAL_MEM_FENCE); if (x == x0) { calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); @@ -246,12 +232,10 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int if (y == y0) calcElementInFirstRow(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); else - { calcElement(src, src_step, src_offset, dists, y, x - x0, x, id, first, col_dists, up_col_dists); - first = (first + 1) % TEMPLATE_SIZE; - } - } + first = (first + 1) % TEMPLATE_SIZE; + } barrier(CLK_LOCAL_MEM_FENCE); convolveWindow(src, src_step, src_offset, dists, almostDist2Weight, dst, dst_step, dst_offset, diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp index 0554b3067..721cdfc55 100644 --- a/modules/photo/test/ocl/test_denoising.cpp +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -71,7 +71,7 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool) { const int type = CV_8UC(cn); - Size roiSize = randomSize(1, MAX_VALUE); + Size roiSize = randomSize(10, MAX_VALUE); Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255); @@ -94,6 +94,10 @@ OCL_TEST_P(FastNlMeansDenoising, Mat) OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize)); OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize)); +// Mat difference; +// cv::subtract(dst_roi, udst_roi, difference); +// print(difference); + OCL_EXPECT_MATS_NEAR(dst, 1) } } From 72c327fef882421d6403b00e66397b29efeae801 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 18 Feb 2014 19:23:38 +0400 Subject: [PATCH 06/11] hope it finally works --- .../src/fast_nlmeans_denoising_opencl.hpp | 19 +++++++++++------ modules/photo/src/opencl/nlmeans.cl | 21 +++++++++++++++---- modules/photo/test/ocl/test_denoising.cpp | 4 ---- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index cd28c1489..eab98998f 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -35,28 +35,35 @@ static int divUp(int a, int b) return (a + b - 1) / b; } -static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, float h, int cn, +template +static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindowSize, int templateWindowSize, FT h, int cn, int & almostTemplateWindowSizeSqBinShift) { const int maxEstimateSumValue = searchWindowSize * searchWindowSize * 255; int fixedPointMult = std::numeric_limits::max() / maxEstimateSumValue; + int depth = DataType::depth; + bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; + + if (depth == CV_64F && !doubleSupport) + return false; // precalc weight for every possible l2 dist between blocks // additional optimization of precalced weights to replace division(averaging) by binary shift CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX) int templateWindowSizeSq = templateWindowSize * templateWindowSize; almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq); - float almostDist2ActualDistMultiplier = (float)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; + FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; - const float WEIGHT_THRESHOLD = 1e-3f; + const FT WEIGHT_THRESHOLD = 1e-3f; int maxDist = 255 * 255 * cn; int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1); - float den = 1.0f / (h * h * cn); + FT den = 1.0f / (h * h * cn); almostDist2Weight.create(1, almostMaxDist, CV_32SC1); ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc, - "-D OP_CALC_WEIGHTS"); + format("-D OP_CALC_WEIGHTS -D FT=%s%s", ocl::typeToStr(depth), + doubleSupport ? " -D DOUBLE_SUPPORT" : "")); if (k.empty()) return false; @@ -99,7 +106,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, return false; UMat almostDist2Weight; - if (!ocl_calcAlmostDist2Weight(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn, + if (!ocl_calcAlmostDist2Weight(almostDist2Weight, searchWindowSize, templateWindowSize, h, cn, almostTemplateWindowSizeSqBinShift)) return false; CV_Assert(almostTemplateWindowSizeSqBinShift >= 0); diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index f038e6ce1..ae1c3090b 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -5,25 +5,38 @@ // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. +#ifdef cl_amd_printf #pragma OPENCL_EXTENSION cl_amd_printf:enable +#endif + +#ifdef DOUBLE_SUPPORT +#ifdef cl_amd_fp64 +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#elif defined cl_khr_fp64 +#pragma OPENCL EXTENSION cl_khr_fp64:enable +#endif +#endif + #ifdef OP_CALC_WEIGHTS __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almostMaxDist, - float almostDist2ActualDistMultiplier, int fixedPointMult, - float den, float WEIGHT_THRESHOLD) + FT almostDist2ActualDistMultiplier, int fixedPointMult, + FT den, FT WEIGHT_THRESHOLD) { int almostDist = get_global_id(0); if (almostDist < almostMaxDist) { - float dist = almostDist * almostDist2ActualDistMultiplier; + FT dist = almostDist * almostDist2ActualDistMultiplier; int weight = convert_int_sat_rte(fixedPointMult * exp(-dist * den)); if (weight < WEIGHT_THRESHOLD * fixedPointMult) weight = 0; almostDist2Weight[almostDist] = weight; + +// printf("%d ", weight); } } @@ -193,7 +206,7 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off if (id == 0) { int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset)); - *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local[0]); + *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local[0] / weights_local[0]); } } diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp index 721cdfc55..9f504d0a0 100644 --- a/modules/photo/test/ocl/test_denoising.cpp +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -94,10 +94,6 @@ OCL_TEST_P(FastNlMeansDenoising, Mat) OCL_OFF(cv::fastNlMeansDenoising(src_roi, dst_roi, h, templateWindowSize, searchWindowSize)); OCL_ON(cv::fastNlMeansDenoising(usrc_roi, udst_roi, h, templateWindowSize, searchWindowSize)); -// Mat difference; -// cv::subtract(dst_roi, udst_roi, difference); -// print(difference); - OCL_EXPECT_MATS_NEAR(dst, 1) } } From 7faf1f6277cf780b78c591c38fb569cf43236dcd Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 18 Feb 2014 19:38:50 +0400 Subject: [PATCH 07/11] some improvements --- modules/photo/perf/opencl/perf_denoising.cpp | 2 +- .../src/fast_nlmeans_denoising_opencl.hpp | 2 +- modules/photo/src/opencl/nlmeans.cl | 30 +++++++++++++------ 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/modules/photo/perf/opencl/perf_denoising.cpp b/modules/photo/perf/opencl/perf_denoising.cpp index 0bdf08363..66903457d 100644 --- a/modules/photo/perf/opencl/perf_denoising.cpp +++ b/modules/photo/perf/opencl/perf_denoising.cpp @@ -26,7 +26,7 @@ OCL_PERF_TEST(Photo, DenoisingGrayscale) OCL_TEST_CYCLE() cv::fastNlMeansDenoising(original, result, 10); - SANITY_CHECK(result); + SANITY_CHECK(result, 1); } OCL_PERF_TEST(Photo, DenoisingColored) diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index eab98998f..3a7037aea 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -19,7 +19,7 @@ enum { BLOCK_ROWS = 32, BLOCK_COLS = 128, - CTA_SIZE = 128 + CTA_SIZE = 256 }; static inline int getNearestPowerOf2(int value) diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index ae1c3090b..583d42e96 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -35,8 +35,6 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost weight = 0; almostDist2Weight[almostDist] = weight; - -// printf("%d ", weight); } } @@ -50,12 +48,20 @@ inline int_t calcDist(uchar_t a, uchar_t b) return diff * diff; } +inline int_t calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t) +{ + int_t A = convert_int_t(down_value) - convert_int_t(down_value_t); + int_t B = convert_int_t(up_value) - convert_int_t(up_value_t); + return (A - B) * (A + B); +} + inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset, __local int_t * dists, int y, int x, int id, __global int_t * col_dists, __global int_t * up_col_dists) { y -= TEMPLATE_SIZE2; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; + int_t col_dists_current_private[TEMPLATE_SIZE]; for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { @@ -68,9 +74,8 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int #pragma unroll for (int j = 0; j < TEMPLATE_SIZE; ++j) - col_dists_current[j] = (int_t)(0); + col_dists_current_private[j] = (int_t)(0); - #pragma unroll for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) { #pragma unroll @@ -78,7 +83,7 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int { value = calcDist(src_template[tx], src_current[tx]); - col_dists_current[tx + TEMPLATE_SIZE2] += value; + col_dists_current_private[tx + TEMPLATE_SIZE2] += value; dist += value; } @@ -86,6 +91,10 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_template = (__global const uchar_t *)((__global const uchar *)src_template + src_step); } + #pragma unroll + for (int j = 0; j < TEMPLATE_SIZE; ++j) + col_dists_current[j] = col_dists_current_private[j]; + dists[i] = dist; up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1]; } @@ -148,7 +157,7 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset __global int_t * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); __global int_t * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); - int_t col_dist = up_col_dists_current[0] + calcDist(down_value, down_value_t) - calcDist(up_value, up_value_t); + int_t col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t); dists[i] += col_dist - col_dists_current[0]; col_dists_current[0] = col_dist; @@ -192,7 +201,7 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off } barrier(CLK_LOCAL_MEM_FENCE); - for (int lsize = CTA_SIZE2 >> 1; lsize > 0; lsize >>= 1) + for (int lsize = CTA_SIZE2 >> 1; lsize > 2; lsize >>= 1) { if (id < lsize) { @@ -206,7 +215,11 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off if (id == 0) { int dst_index = mad24(y, dst_step, mad24(cn, x, dst_offset)); - *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local[0] / weights_local[0]); + int_t weighted_sum_local_0 = weighted_sum_local[0] + weighted_sum_local[1] + + weighted_sum_local[2] + weighted_sum_local[3]; + int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; + + *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / weights_local_0); } } @@ -234,7 +247,6 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int for (int y = y0; y < y1; ++y) for (int x = x0; x < x1; ++x) { - // barrier(CLK_LOCAL_MEM_FENCE); if (x == x0) { calcFirstElementInRow(src, src_step, src_offset, dists, y, x, id, col_dists, up_col_dists); From 634e8d37cc208f5f0f54891d7dc7d4f991769e7f Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 19 Feb 2014 18:13:09 +0400 Subject: [PATCH 08/11] fixed for 2 channels --- modules/photo/perf/opencl/perf_denoising.cpp | 4 +- ...fast_nlmeans_denoising_invoker_commons.hpp | 35 +++++++++- modules/photo/src/opencl/nlmeans.cl | 70 ++++++++++++------- modules/photo/test/ocl/test_denoising.cpp | 4 +- 4 files changed, 84 insertions(+), 29 deletions(-) diff --git a/modules/photo/perf/opencl/perf_denoising.cpp b/modules/photo/perf/opencl/perf_denoising.cpp index 66903457d..0a0a2d6b5 100644 --- a/modules/photo/perf/opencl/perf_denoising.cpp +++ b/modules/photo/perf/opencl/perf_denoising.cpp @@ -45,7 +45,7 @@ OCL_PERF_TEST(Photo, DenoisingColored) SANITY_CHECK(result); } -OCL_PERF_TEST(Photo, DenoisingGrayscaleMulti) +OCL_PERF_TEST(Photo, DESABLED_DenoisingGrayscaleMulti) { const int imgs_count = 3; @@ -68,7 +68,7 @@ OCL_PERF_TEST(Photo, DenoisingGrayscaleMulti) SANITY_CHECK(result); } -OCL_PERF_TEST(Photo, DenoisingColoredMulti) +OCL_PERF_TEST(Photo, DESABLED_DenoisingColoredMulti) { const int imgs_count = 3; diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp index 7a2650077..ab7db5d2d 100644 --- a/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_invoker_commons.hpp @@ -84,7 +84,6 @@ template static inline void incWithWeight(int* estimation, int weig template <> inline void incWithWeight(int* estimation, int weight, uchar p) { - estimation[0] += weight * p; } @@ -101,6 +100,24 @@ template <> inline void incWithWeight(int* estimation, int weight, Vec3b p) estimation[2] += weight * p[2]; } +template <> inline void incWithWeight(int* estimation, int weight, int p) +{ + estimation[0] += weight * p; +} + +template <> inline void incWithWeight(int* estimation, int weight, Vec2i p) +{ + estimation[0] += weight * p[0]; + estimation[1] += weight * p[1]; +} + +template <> inline void incWithWeight(int* estimation, int weight, Vec3i p) +{ + estimation[0] += weight * p[0]; + estimation[1] += weight * p[1]; + estimation[2] += weight * p[2]; +} + template static inline T saturateCastFromArray(int* estimation); template <> inline uchar saturateCastFromArray(int* estimation) @@ -125,4 +142,20 @@ template <> inline Vec3b saturateCastFromArray(int* estimation) return res; } +template <> inline int saturateCastFromArray(int* estimation) +{ + return estimation[0]; +} + +template <> inline Vec2i saturateCastFromArray(int* estimation) +{ + estimation[1] = 0; + return Vec2i(estimation); +} + +template <> inline Vec3i saturateCastFromArray(int* estimation) +{ + return Vec3i(estimation); +} + #endif diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index 583d42e96..7b13ea5e1 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -40,41 +40,61 @@ __kernel void calcAlmostDist2Weight(__global int * almostDist2Weight, int almost #elif defined OP_CALC_FASTNLMEANS +#define noconvert + #define SEARCH_SIZE_SQ (SEARCH_SIZE * SEARCH_SIZE) -inline int_t calcDist(uchar_t a, uchar_t b) +inline int calcDist(uchar_t a, uchar_t b) { int_t diff = convert_int_t(a) - convert_int_t(b); - return diff * diff; + int_t retval = diff * diff; + +#if cn == 1 + return retval; +#elif cn == 2 + return retval.x + retval.y; +#else +#error "cn should be either 1 or 2" +#endif } -inline int_t calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t) +inline int calcDistUpDown(uchar_t down_value, uchar_t down_value_t, uchar_t up_value, uchar_t up_value_t) { int_t A = convert_int_t(down_value) - convert_int_t(down_value_t); int_t B = convert_int_t(up_value) - convert_int_t(up_value_t); - return (A - B) * (A + B); + int_t retval = (A - B) * (A + B); + +#if cn == 1 + return retval; +#elif cn == 2 + return retval.x + retval.y; +#else +#error "cn should be either 1 or 2" +#endif } +#define COND if (x == 0 && y == 0) + inline void calcFirstElementInRow(__global const uchar * src, int src_step, int src_offset, - __local int_t * dists, int y, int x, int id, - __global int_t * col_dists, __global int_t * up_col_dists) + __local int * dists, int y, int x, int id, + __global int * col_dists, __global int * up_col_dists) { y -= TEMPLATE_SIZE2; int sx = x - SEARCH_SIZE2, sy = y - SEARCH_SIZE2; - int_t col_dists_current_private[TEMPLATE_SIZE]; + int col_dists_current_private[TEMPLATE_SIZE]; for (int i = id, size = SEARCH_SIZE_SQ; i < size; i += CTA_SIZE) { - int_t dist = (int_t)(0), value; + int dist = 0, value; __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); - __global int_t * col_dists_current = col_dists + i * TEMPLATE_SIZE; + __global int * col_dists_current = col_dists + i * TEMPLATE_SIZE; #pragma unroll for (int j = 0; j < TEMPLATE_SIZE; ++j) - col_dists_current_private[j] = (int_t)(0); + col_dists_current_private[j] = 0; for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) { @@ -95,14 +115,16 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int for (int j = 0; j < TEMPLATE_SIZE; ++j) col_dists_current[j] = col_dists_current_private[j]; +// COND printf("%d %d\n", i, convert_int(dist)); + dists[i] = dist; up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1]; } } inline void calcElementInFirstRow(__global const uchar * src, int src_step, int src_offset, - __local int_t * dists, int y, int x0, int x, int id, int first, - __global int_t * col_dists, __global int_t * up_col_dists) + __local int * dists, int y, int x0, int x, int id, int first, + __global int * col_dists, __global int * up_col_dists) { x += TEMPLATE_SIZE2; y -= TEMPLATE_SIZE2; @@ -113,9 +135,9 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int __global const uchar_t * src_current = (__global const uchar_t *)(src + mad24(y, src_step, mad24(cn, x, src_offset))); __global const uchar_t * src_template = (__global const uchar_t *)(src + mad24(sy + i / SEARCH_SIZE, src_step, mad24(cn, sx + i % SEARCH_SIZE, src_offset))); - __global int_t * col_dists_current = col_dists + TEMPLATE_SIZE * i; + __global int * col_dists_current = col_dists + TEMPLATE_SIZE * i; - int_t col_dist = (int_t)(0); + int col_dist = 0; #pragma unroll for (int ty = 0; ty < TEMPLATE_SIZE; ++ty) @@ -133,8 +155,8 @@ inline void calcElementInFirstRow(__global const uchar * src, int src_step, int } inline void calcElement(__global const uchar * src, int src_step, int src_offset, - __local int_t * dists, int y, int x0, int x, int id, int first, - __global int_t * col_dists, __global int_t * up_col_dists) + __local int * dists, int y, int x0, int x, int id, int first, + __global int * col_dists, __global int * up_col_dists) { int sx = x + TEMPLATE_SIZE2; int sy_up = y - TEMPLATE_SIZE2 - 1; @@ -154,10 +176,10 @@ inline void calcElement(__global const uchar * src, int src_step, int src_offset uchar_t up_value_t = *(__global const uchar_t *)(src + mad24(sy_up + wy, src_step, mad24(cn, sx + wx, src_offset))); uchar_t down_value_t = *(__global const uchar_t *)(src + mad24(sy_down + wy, src_step, mad24(cn, sx + wx, src_offset))); - __global int_t * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); - __global int_t * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); + __global int * col_dists_current = col_dists + mad24(i, TEMPLATE_SIZE, first); + __global int * up_col_dists_current = up_col_dists + mad24(x0, SEARCH_SIZE_SQ, i); - int_t col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t); + int col_dist = up_col_dists_current[0] + calcDistUpDown(down_value, down_value_t, up_value, up_value_t); dists[i] += col_dist - col_dists_current[0]; col_dists_current[0] = col_dist; @@ -219,7 +241,7 @@ inline void convolveWindow(__global const uchar * src, int src_step, int src_off weighted_sum_local[2] + weighted_sum_local[3]; int weights_local_0 = weights_local[0] + weights_local[1] + weights_local[2] + weights_local[3]; - *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / weights_local_0); + *(__global uchar_t *)(dst + dst_index) = convert_uchar_t(weighted_sum_local_0 / (int_t)(weights_local_0)); } } @@ -232,8 +254,8 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int int block_y = get_group_id(1); int id = get_local_id(0), first; - __local int_t dists[SEARCH_SIZE_SQ], weighted_sum[CTA_SIZE2]; - __local int weights[CTA_SIZE2]; + __local int dists[SEARCH_SIZE_SQ], weights[CTA_SIZE2]; + __local int_t weighted_sum[CTA_SIZE2]; int x0 = block_x * BLOCK_COLS, x1 = min(x0 + BLOCK_COLS, dst_cols); int y0 = block_y * BLOCK_ROWS, y1 = min(y0 + BLOCK_ROWS, dst_rows); @@ -241,8 +263,8 @@ __kernel void fastNlMeansDenoising(__global const uchar * src, int src_step, int // for each group we need SEARCH_SIZE_SQ * TEMPLATE_SIZE integer buffer for storing part column sum for current element // and SEARCH_SIZE_SQ * BLOCK_COLS integer buffer for storing last column sum for each element of search window of up row int block_data_start = SEARCH_SIZE_SQ * (mad24(block_y, dst_cols, x0) + mad24(block_y, nblocks_x, block_x) * TEMPLATE_SIZE); - __global int_t * col_dists = (__global int_t *)(buffer + block_data_start * sizeof(int_t)); - __global int_t * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE; + __global int * col_dists = (__global int *)(buffer + block_data_start * sizeof(int)); + __global int * up_col_dists = col_dists + SEARCH_SIZE_SQ * TEMPLATE_SIZE; for (int y = y0; y < y1; ++y) for (int x = x0; x < x1; ++x) diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp index 9f504d0a0..6630b0291 100644 --- a/modules/photo/test/ocl/test_denoising.cpp +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -71,7 +71,7 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool) { const int type = CV_8UC(cn); - Size roiSize = randomSize(10, MAX_VALUE); + Size roiSize = randomSize(1, MAX_VALUE); Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255); @@ -98,7 +98,7 @@ OCL_TEST_P(FastNlMeansDenoising, Mat) } } -OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values((Channels)1), Bool())); +OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool())); } } // namespace cvtest::ocl From 8bf616e4c78d5e1133c634b4059f25a162f5ab36 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 19 Feb 2014 18:34:34 +0400 Subject: [PATCH 09/11] added color image support via L*a*b* --- modules/photo/perf/opencl/perf_denoising.cpp | 4 +-- modules/photo/src/denoising.cpp | 15 ++++++--- .../src/fast_nlmeans_denoising_opencl.hpp | 32 +++++++++++++++++++ modules/photo/test/ocl/test_denoising.cpp | 16 ++++++++++ 4 files changed, 60 insertions(+), 7 deletions(-) diff --git a/modules/photo/perf/opencl/perf_denoising.cpp b/modules/photo/perf/opencl/perf_denoising.cpp index 0a0a2d6b5..cd0d3a242 100644 --- a/modules/photo/perf/opencl/perf_denoising.cpp +++ b/modules/photo/perf/opencl/perf_denoising.cpp @@ -45,7 +45,7 @@ OCL_PERF_TEST(Photo, DenoisingColored) SANITY_CHECK(result); } -OCL_PERF_TEST(Photo, DESABLED_DenoisingGrayscaleMulti) +OCL_PERF_TEST(Photo, DISABLED_DenoisingGrayscaleMulti) { const int imgs_count = 3; @@ -68,7 +68,7 @@ OCL_PERF_TEST(Photo, DESABLED_DenoisingGrayscaleMulti) SANITY_CHECK(result); } -OCL_PERF_TEST(Photo, DESABLED_DenoisingColoredMulti) +OCL_PERF_TEST(Photo, DISABLED_DenoisingColoredMulti) { const int imgs_count = 3; diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp index a3aa0f41f..c9670f452 100644 --- a/modules/photo/src/denoising.cpp +++ b/modules/photo/src/denoising.cpp @@ -86,15 +86,20 @@ void cv::fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, float h, float hForColorComponents, int templateWindowSize, int searchWindowSize) { - Mat src = _src.getMat(); - _dst.create(src.size(), src.type()); - Mat dst = _dst.getMat(); - - if (src.type() != CV_8UC3) { + if (_src.type() != CV_8UC3) + { CV_Error(Error::StsBadArg, "Type of input image should be CV_8UC3!"); return; } + CV_OCL_RUN(_src.dims() <= 2 && (_dst.isUMat() || _src.isUMat()), + ocl_fastNlMeansDenoisingColored(_src, _dst, h, hForColorComponents, + templateWindowSize, searchWindowSize)) + + Mat src = _src.getMat(); + _dst.create(src.size(), src.type()); + Mat dst = _dst.getMat(); + Mat src_lab; cvtColor(src, src_lab, COLOR_LBGR2Lab); diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index 3a7037aea..9e4cd0609 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -132,6 +132,38 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, return k.run(2, globalsize, localsize, false); } +static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, + float h, float hForColorComponents, + int templateWindowSize, int searchWindowSize) +{ + UMat src = _src.getUMat(); + _dst.create(src.size(), src.type()); + UMat dst = _dst.getUMat(); + + UMat src_lab; + cvtColor(src, src_lab, COLOR_LBGR2Lab); + + UMat l(src.size(), CV_8U); + UMat ab(src.size(), CV_8UC2); + std::vector l_ab(2), l_ab_denoised(2); + l_ab[0] = l; + l_ab[1] = ab; + l_ab_denoised[0].create(src.size(), CV_8U); + l_ab_denoised[1].create(src.size(), CV_8UC2); + + int from_to[] = { 0,0, 1,1, 2,2 }; + mixChannels(std::vector(1, src_lab), l_ab, from_to, 3); + + fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize); + fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize); + + UMat dst_lab(src.size(), src.type()); + mixChannels(l_ab_denoised, std::vector(1, dst_lab), from_to, 3); + + cvtColor(dst_lab, dst, COLOR_Lab2LBGR); + return true; +} + } #endif diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp index 6630b0291..51de63462 100644 --- a/modules/photo/test/ocl/test_denoising.cpp +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -98,7 +98,23 @@ OCL_TEST_P(FastNlMeansDenoising, Mat) } } +typedef FastNlMeansDenoisingTestBase fastNlMeansDenoisingColored; + +OCL_TEST_P(fastNlMeansDenoisingColored, Mat) +{ + for (int j = 0; j < test_loop_times; j++) + { + generateTestData(); + + OCL_OFF(cv::fastNlMeansDenoisingColored(src_roi, dst_roi, h, h, templateWindowSize, searchWindowSize)); + OCL_ON(cv::fastNlMeansDenoisingColored(usrc_roi, udst_roi, h, h, templateWindowSize, searchWindowSize)); + + OCL_EXPECT_MATS_NEAR(dst, 1) + } +} + OCL_INSTANTIATE_TEST_CASE_P(Photo, FastNlMeansDenoising, Combine(Values(1, 2), Bool())); +OCL_INSTANTIATE_TEST_CASE_P(Photo, fastNlMeansDenoisingColored, Combine(Values(Channels(3)), Bool())); } } // namespace cvtest::ocl From c7857e8c1341426f9a16c55ac0923adec5a996d0 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 20 Feb 2014 16:05:50 +0400 Subject: [PATCH 10/11] small fixes --- modules/photo/perf/opencl/perf_denoising.cpp | 2 +- .../src/fast_nlmeans_denoising_opencl.hpp | 13 +++--- modules/photo/src/opencl/nlmeans.cl | 2 - modules/photo/test/ocl/test_denoising.cpp | 45 +++---------------- 4 files changed, 13 insertions(+), 49 deletions(-) diff --git a/modules/photo/perf/opencl/perf_denoising.cpp b/modules/photo/perf/opencl/perf_denoising.cpp index cd0d3a242..a2ee9178a 100644 --- a/modules/photo/perf/opencl/perf_denoising.cpp +++ b/modules/photo/perf/opencl/perf_denoising.cpp @@ -42,7 +42,7 @@ OCL_PERF_TEST(Photo, DenoisingColored) OCL_TEST_CYCLE() cv::fastNlMeansDenoisingColored(original, result, 10, 10); - SANITY_CHECK(result); + SANITY_CHECK(result, 2); } OCL_PERF_TEST(Photo, DISABLED_DenoisingGrayscaleMulti) diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index 9e4cd0609..a12132167 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -9,20 +9,20 @@ #define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__ #include "precomp.hpp" - -#define CV_OPENCL_RUN_ASSERT #include "opencl_kernels.hpp" +#ifdef HAVE_OPENCL + namespace cv { enum { BLOCK_ROWS = 32, - BLOCK_COLS = 128, + BLOCK_COLS = 32, CTA_SIZE = 256 }; -static inline int getNearestPowerOf2(int value) +static inline int getNearestPowerOf2OpenCL(int value) { int p = 0; while (1 << p < value) @@ -51,7 +51,7 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow // additional optimization of precalced weights to replace division(averaging) by binary shift CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX) int templateWindowSizeSq = templateWindowSize * templateWindowSize; - almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq); + almostTemplateWindowSizeSqBinShift = getNearestPowerOf2OpenCL(templateWindowSizeSq); FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; const FT WEIGHT_THRESHOLD = 1e-3f; @@ -128,7 +128,7 @@ static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, ocl::KernelArg::PtrReadOnly(almostDist2Weight), ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift); - size_t globalsize[2] = { nblocksx * BLOCK_COLS, nblocksy * BLOCK_ROWS }, localsize[2] = { CTA_SIZE, 1 }; + size_t globalsize[2] = { nblocksx * CTA_SIZE, nblocksy }, localsize[2] = { CTA_SIZE, 1 }; return k.run(2, globalsize, localsize, false); } @@ -167,3 +167,4 @@ static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst, } #endif +#endif diff --git a/modules/photo/src/opencl/nlmeans.cl b/modules/photo/src/opencl/nlmeans.cl index 7b13ea5e1..67a6ec61f 100644 --- a/modules/photo/src/opencl/nlmeans.cl +++ b/modules/photo/src/opencl/nlmeans.cl @@ -115,8 +115,6 @@ inline void calcFirstElementInRow(__global const uchar * src, int src_step, int for (int j = 0; j < TEMPLATE_SIZE; ++j) col_dists_current[j] = col_dists_current_private[j]; -// COND printf("%d %d\n", i, convert_int(dist)); - dists[i] = dist; up_col_dists[0 + i] = col_dists[TEMPLATE_SIZE - 1]; } diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp index 51de63462..819c65daa 100644 --- a/modules/photo/test/ocl/test_denoising.cpp +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -1,44 +1,9 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved. // Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ #include "test_precomp.hpp" #include "opencv2/ts/ocl_test.hpp" From 9b31e6cd4478d245d493bbff2e2ea373fee524f7 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Tue, 11 Mar 2014 15:21:14 +0400 Subject: [PATCH 11/11] changes according to reviewer's suggestions --- .../photo/src/fast_nlmeans_denoising_opencl.hpp | 14 +++----------- modules/photo/test/ocl/test_denoising.cpp | 11 ++++++++++- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp index a12132167..ad1d94254 100644 --- a/modules/photo/src/fast_nlmeans_denoising_opencl.hpp +++ b/modules/photo/src/fast_nlmeans_denoising_opencl.hpp @@ -22,14 +22,6 @@ enum CTA_SIZE = 256 }; -static inline int getNearestPowerOf2OpenCL(int value) -{ - int p = 0; - while (1 << p < value) - ++p; - return p; -} - static int divUp(int a, int b) { return (a + b - 1) / b; @@ -51,7 +43,7 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow // additional optimization of precalced weights to replace division(averaging) by binary shift CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX) int templateWindowSizeSq = templateWindowSize * templateWindowSize; - almostTemplateWindowSizeSqBinShift = getNearestPowerOf2OpenCL(templateWindowSizeSq); + almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq); FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq; const FT WEIGHT_THRESHOLD = 1e-3f; @@ -77,10 +69,10 @@ static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight, int searchWindow static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, float h, int templateWindowSize, int searchWindowSize) { - int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); + int type = _src.type(), cn = CV_MAT_CN(type); Size size = _src.size(); - if ( !(depth == CV_8U && cn <= 4 && cn != 3) ) + if ( type != CV_8UC1 || type != CV_8UC2 || type != CV_8UC4 ) return false; int templateWindowHalfWize = templateWindowSize / 2; diff --git a/modules/photo/test/ocl/test_denoising.cpp b/modules/photo/test/ocl/test_denoising.cpp index 819c65daa..745a45745 100644 --- a/modules/photo/test/ocl/test_denoising.cpp +++ b/modules/photo/test/ocl/test_denoising.cpp @@ -34,11 +34,20 @@ PARAM_TEST_CASE(FastNlMeansDenoisingTestBase, Channels, bool) virtual void generateTestData() { + Mat image; + if (cn == 1) + { + image = readImage("denoising/lena_noised_gaussian_sigma=10.png", IMREAD_GRAYSCALE); + ASSERT_FALSE(image.empty()); + } + const int type = CV_8UC(cn); - Size roiSize = randomSize(1, MAX_VALUE); + Size roiSize = cn == 1 ? image.size() : randomSize(1, MAX_VALUE); Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(src, src_roi, roiSize, srcBorder, type, 0, 255); + if (cn == 1) + image.copyTo(src_roi); Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0); randomSubMat(dst, dst_roi, roiSize, dstBorder, type, 0, 255);