significantly improved parallel non-local means by using granularity parameter in parallel_for_ loop. Because the algorithm deals with sliding sums, it's essential that each thread has enough work to do, otherwise the algorithm gets higher theoretical complexity and thus there is no speedup comparing to 1-thread code (at best).

This commit is contained in:
Vadim Pisarevsky 2015-05-14 15:39:42 +03:00
parent feb5b6aa93
commit b37aaa8303
2 changed files with 41 additions and 14 deletions

View File

@ -50,42 +50,50 @@ static void fastNlMeansDenoising_( const Mat& src, Mat& dst, const std::vector<f
int templateWindowSize, int searchWindowSize)
{
int hn = (int)h.size();
double granularity = (double)std::max(1., (double)dst.total()/(1 << 17));
switch (CV_MAT_CN(src.type())) {
case 1:
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<ST, IT, UIT, D, int>(
src, dst, templateWindowSize, searchWindowSize, &h[0]));
src, dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
case 2:
if (hn == 1)
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
src, dst, templateWindowSize, searchWindowSize, &h[0]));
src, dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
else
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
src, dst, templateWindowSize, searchWindowSize, &h[0]));
src, dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
case 3:
if (hn == 1)
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
src, dst, templateWindowSize, searchWindowSize, &h[0]));
src, dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
else
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
src, dst, templateWindowSize, searchWindowSize, &h[0]));
src, dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
case 4:
if (hn == 1)
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
src, dst, templateWindowSize, searchWindowSize, &h[0]));
src, dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
else
parallel_for_(cv::Range(0, src.rows),
FastNlMeansDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
src, dst, templateWindowSize, searchWindowSize, &h[0]));
src, dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
default:
CV_Error(Error::StsBadArg,
@ -237,6 +245,7 @@ static void fastNlMeansDenoisingMulti_( const std::vector<Mat>& srcImgs, Mat& ds
int templateWindowSize, int searchWindowSize)
{
int hn = (int)h.size();
double granularity = (double)std::max(1., (double)dst.total()/(1 << 16));
switch (srcImgs[0].type())
{
@ -244,43 +253,50 @@ static void fastNlMeansDenoisingMulti_( const std::vector<Mat>& srcImgs, Mat& ds
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<uchar, IT, UIT, D, int>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, &h[0]));
dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
case CV_8UC2:
if (hn == 1)
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, int>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, &h[0]));
dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
else
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<Vec<ST, 2>, IT, UIT, D, Vec2i>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, &h[0]));
dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
case CV_8UC3:
if (hn == 1)
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, int>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, &h[0]));
dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
else
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<Vec<ST, 3>, IT, UIT, D, Vec3i>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, &h[0]));
dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
case CV_8UC4:
if (hn == 1)
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, int>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, &h[0]));
dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
else
parallel_for_(cv::Range(0, srcImgs[0].rows),
FastNlMeansMultiDenoisingInvoker<Vec<ST, 4>, IT, UIT, D, Vec4i>(
srcImgs, imgToDenoiseIndex, temporalWindowSize,
dst, templateWindowSize, searchWindowSize, &h[0]));
dst, templateWindowSize, searchWindowSize, &h[0]),
granularity);
break;
default:
CV_Error(Error::StsBadArg,

View File

@ -156,3 +156,14 @@ TEST(Photo_White, issue_2646)
ASSERT_EQ(0, nonWhitePixelsCount);
}
TEST(Photo_Denoising, speed)
{
string imgname = string(cvtest::TS::ptr()->get_data_path()) + "shared/5MP.png";
Mat src = imread(imgname, 0), dst;
double t = (double)getTickCount();
fastNlMeansDenoising(src, dst, 5, 7, 21);
t = (double)getTickCount() - t;
printf("execution time: %gms\n", t*1000./getTickFrequency());
}