Replaced most usages of parallel_for with parallel_for_.

This should allow many algorithms to take advantage of more parallelization
technologies.
This commit is contained in:
Roman Donchenko
2013-05-30 18:44:33 +04:00
parent 37091b086c
commit 29b13ec1de
24 changed files with 232 additions and 352 deletions

View File

@@ -2755,7 +2755,7 @@ const int ITUR_BT_601_CGV = -385875;
const int ITUR_BT_601_CBV = -74448;
template<int bIdx, int uIdx>
struct YUV420sp2RGB888Invoker
struct YUV420sp2RGB888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *muv;
@@ -2764,10 +2764,10 @@ struct YUV420sp2RGB888Invoker
YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
: dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin() * 2;
int rangeEnd = range.end() * 2;
int rangeBegin = range.start * 2;
int rangeEnd = range.end * 2;
//R = 1.164(Y - 16) + 1.596(V - 128)
//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -2824,7 +2824,7 @@ struct YUV420sp2RGB888Invoker
};
template<int bIdx, int uIdx>
struct YUV420sp2RGBA8888Invoker
struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *muv;
@@ -2833,10 +2833,10 @@ struct YUV420sp2RGBA8888Invoker
YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
: dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin() * 2;
int rangeEnd = range.end() * 2;
int rangeBegin = range.start * 2;
int rangeEnd = range.end * 2;
//R = 1.164(Y - 16) + 1.596(V - 128)
//G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -2897,7 +2897,7 @@ struct YUV420sp2RGBA8888Invoker
};
template<int bIdx>
struct YUV420p2RGB888Invoker
struct YUV420p2RGB888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *mu, *mv;
@@ -2907,19 +2907,19 @@ struct YUV420p2RGB888Invoker
YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
: dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
const int rangeBegin = range.begin() * 2;
const int rangeEnd = range.end() * 2;
const int rangeBegin = range.start * 2;
const int rangeEnd = range.end * 2;
size_t uvsteps[2] = {width/2, stride - width/2};
int usIdx = ustepIdx, vsIdx = vstepIdx;
const uchar* y1 = my1 + rangeBegin * stride;
const uchar* u1 = mu + (range.begin() / 2) * stride;
const uchar* v1 = mv + (range.begin() / 2) * stride;
const uchar* u1 = mu + (range.start / 2) * stride;
const uchar* v1 = mv + (range.start / 2) * stride;
if(range.begin() % 2 == 1)
if(range.start % 2 == 1)
{
u1 += uvsteps[(usIdx++) & 1];
v1 += uvsteps[(vsIdx++) & 1];
@@ -2965,7 +2965,7 @@ struct YUV420p2RGB888Invoker
};
template<int bIdx>
struct YUV420p2RGBA8888Invoker
struct YUV420p2RGBA8888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* my1, *mu, *mv;
@@ -2975,19 +2975,19 @@ struct YUV420p2RGBA8888Invoker
YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
: dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin() * 2;
int rangeEnd = range.end() * 2;
int rangeBegin = range.start * 2;
int rangeEnd = range.end * 2;
size_t uvsteps[2] = {width/2, stride - width/2};
int usIdx = ustepIdx, vsIdx = vstepIdx;
const uchar* y1 = my1 + rangeBegin * stride;
const uchar* u1 = mu + (range.begin() / 2) * stride;
const uchar* v1 = mv + (range.begin() / 2) * stride;
const uchar* u1 = mu + (range.start / 2) * stride;
const uchar* v1 = mv + (range.start / 2) * stride;
if(range.begin() % 2 == 1)
if(range.start % 2 == 1)
{
u1 += uvsteps[(usIdx++) & 1];
v1 += uvsteps[(vsIdx++) & 1];
@@ -3042,48 +3042,40 @@ template<int bIdx, int uIdx>
inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
{
YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1, _uv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
template<int bIdx, int uIdx>
inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
{
YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1, _uv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
template<int bIdx>
inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
{
YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1, _u, _v, ustepIdx, vstepIdx);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
template<int bIdx>
inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
{
YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1, _u, _v, ustepIdx, vstepIdx);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows/2), converter);
parallel_for_(Range(0, _dst.rows/2), converter);
else
#endif
converter(BlockedRange(0, _dst.rows/2));
converter(Range(0, _dst.rows/2));
}
///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
@@ -3167,7 +3159,7 @@ static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
template<int bIdx, int uIdx, int yIdx>
struct YUV422toRGB888Invoker
struct YUV422toRGB888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* src;
@@ -3176,10 +3168,10 @@ struct YUV422toRGB888Invoker
YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
: dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin();
int rangeEnd = range.end();
int rangeBegin = range.start;
int rangeEnd = range.end;
const int uidx = 1 - yIdx + uIdx * 2;
const int vidx = (2 + uidx) % 4;
@@ -3213,7 +3205,7 @@ struct YUV422toRGB888Invoker
};
template<int bIdx, int uIdx, int yIdx>
struct YUV422toRGBA8888Invoker
struct YUV422toRGBA8888Invoker : ParallelLoopBody
{
Mat* dst;
const uchar* src;
@@ -3222,10 +3214,10 @@ struct YUV422toRGBA8888Invoker
YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
: dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
void operator()(const BlockedRange& range) const
void operator()(const Range& range) const
{
int rangeBegin = range.begin();
int rangeEnd = range.end();
int rangeBegin = range.start;
int rangeEnd = range.end;
const int uidx = 1 - yIdx + uIdx * 2;
const int vidx = (2 + uidx) % 4;
@@ -3266,24 +3258,20 @@ template<int bIdx, int uIdx, int yIdx>
inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
{
YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows), converter);
parallel_for_(Range(0, _dst.rows), converter);
else
#endif
converter(BlockedRange(0, _dst.rows));
converter(Range(0, _dst.rows));
}
template<int bIdx, int uIdx, int yIdx>
inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
{
YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
#ifdef HAVE_TBB
if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
parallel_for(BlockedRange(0, _dst.rows), converter);
parallel_for_(Range(0, _dst.rows), converter);
else
#endif
converter(BlockedRange(0, _dst.rows));
converter(Range(0, _dst.rows));
}
/////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////

View File

@@ -443,7 +443,7 @@ icvGetDistanceTransformMask( int maskType, float *metrics )
namespace cv
{
struct DTColumnInvoker
struct DTColumnInvoker : ParallelLoopBody
{
DTColumnInvoker( const CvMat* _src, CvMat* _dst, const int* _sat_tab, const float* _sqr_tab)
{
@@ -453,9 +453,9 @@ struct DTColumnInvoker
sqr_tab = _sqr_tab;
}
void operator()( const BlockedRange& range ) const
void operator()( const Range& range ) const
{
int i, i1 = range.begin(), i2 = range.end();
int i, i1 = range.start, i2 = range.end;
int m = src->rows;
size_t sstep = src->step, dstep = dst->step/sizeof(float);
AutoBuffer<int> _d(m);
@@ -490,7 +490,7 @@ struct DTColumnInvoker
};
struct DTRowInvoker
struct DTRowInvoker : ParallelLoopBody
{
DTRowInvoker( CvMat* _dst, const float* _sqr_tab, const float* _inv_tab )
{
@@ -499,10 +499,10 @@ struct DTRowInvoker
inv_tab = _inv_tab;
}
void operator()( const BlockedRange& range ) const
void operator()( const Range& range ) const
{
const float inf = 1e15f;
int i, i1 = range.begin(), i2 = range.end();
int i, i1 = range.start, i2 = range.end;
int n = dst->cols;
AutoBuffer<uchar> _buf((n+2)*2*sizeof(float) + (n+2)*sizeof(int));
float* f = (float*)(uchar*)_buf;
@@ -586,7 +586,7 @@ icvTrueDistTrans( const CvMat* src, CvMat* dst )
for( ; i <= m*3; i++ )
sat_tab[i] = i - shift;
cv::parallel_for(cv::BlockedRange(0, n), cv::DTColumnInvoker(src, dst, sat_tab, sqr_tab));
cv::parallel_for_(cv::Range(0, n), cv::DTColumnInvoker(src, dst, sat_tab, sqr_tab));
// stage 2: compute modified distance transform for each row
float* inv_tab = sqr_tab + n;
@@ -598,7 +598,7 @@ icvTrueDistTrans( const CvMat* src, CvMat* dst )
sqr_tab[i] = (float)(i*i);
}
cv::parallel_for(cv::BlockedRange(0, m), cv::DTRowInvoker(dst, sqr_tab, inv_tab));
cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(dst, sqr_tab, inv_tab));
}

View File

@@ -2986,29 +2986,23 @@ cvCalcProbDensity( const CvHistogram* hist, const CvHistogram* hist_mask,
}
}
class EqualizeHistCalcHist_Invoker
class EqualizeHistCalcHist_Invoker : public cv::ParallelLoopBody
{
public:
enum {HIST_SZ = 256};
#ifdef HAVE_TBB
typedef tbb::mutex* MutextPtr;
#else
typedef void* MutextPtr;
#endif
EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, MutextPtr histogramLock)
EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, cv::Mutex* histogramLock)
: src_(src), globalHistogram_(histogram), histogramLock_(histogramLock)
{ }
void operator()( const cv::BlockedRange& rowRange ) const
void operator()( const cv::Range& rowRange ) const
{
int localHistogram[HIST_SZ] = {0, };
const size_t sstep = src_.step;
int width = src_.cols;
int height = rowRange.end() - rowRange.begin();
int height = rowRange.end - rowRange.start;
if (src_.isContinuous())
{
@@ -3016,7 +3010,7 @@ public:
height = 1;
}
for (const uchar* ptr = src_.ptr<uchar>(rowRange.begin()); height--; ptr += sstep)
for (const uchar* ptr = src_.ptr<uchar>(rowRange.start); height--; ptr += sstep)
{
int x = 0;
for (; x <= width - 4; x += 4)
@@ -3031,9 +3025,7 @@ public:
localHistogram[ptr[x]]++;
}
#ifdef HAVE_TBB
tbb::mutex::scoped_lock lock(*histogramLock_);
#endif
cv::AutoLock lock(*histogramLock_);
for( int i = 0; i < HIST_SZ; i++ )
globalHistogram_[i] += localHistogram[i];
@@ -3041,12 +3033,7 @@ public:
static bool isWorthParallel( const cv::Mat& src )
{
#ifdef HAVE_TBB
return ( src.total() >= 640*480 );
#else
(void)src;
return false;
#endif
}
private:
@@ -3054,10 +3041,10 @@ private:
cv::Mat& src_;
int* globalHistogram_;
MutextPtr histogramLock_;
cv::Mutex* histogramLock_;
};
class EqualizeHistLut_Invoker
class EqualizeHistLut_Invoker : public cv::ParallelLoopBody
{
public:
EqualizeHistLut_Invoker( cv::Mat& src, cv::Mat& dst, int* lut )
@@ -3066,13 +3053,13 @@ public:
lut_(lut)
{ }
void operator()( const cv::BlockedRange& rowRange ) const
void operator()( const cv::Range& rowRange ) const
{
const size_t sstep = src_.step;
const size_t dstep = dst_.step;
int width = src_.cols;
int height = rowRange.end() - rowRange.begin();
int height = rowRange.end - rowRange.start;
int* lut = lut_;
if (src_.isContinuous() && dst_.isContinuous())
@@ -3081,8 +3068,8 @@ public:
height = 1;
}
const uchar* sptr = src_.ptr<uchar>(rowRange.begin());
uchar* dptr = dst_.ptr<uchar>(rowRange.begin());
const uchar* sptr = src_.ptr<uchar>(rowRange.start);
uchar* dptr = dst_.ptr<uchar>(rowRange.start);
for (; height--; sptr += sstep, dptr += dstep)
{
@@ -3111,12 +3098,7 @@ public:
static bool isWorthParallel( const cv::Mat& src )
{
#ifdef HAVE_TBB
return ( src.total() >= 640*480 );
#else
(void)src;
return false;
#endif
}
private:
@@ -3143,23 +3125,18 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
if(src.empty())
return;
#ifdef HAVE_TBB
tbb::mutex histogramLockInstance;
EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = &histogramLockInstance;
#else
EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = 0;
#endif
Mutex histogramLockInstance;
const int hist_sz = EqualizeHistCalcHist_Invoker::HIST_SZ;
int hist[hist_sz] = {0,};
int lut[hist_sz];
EqualizeHistCalcHist_Invoker calcBody(src, hist, histogramLock);
EqualizeHistCalcHist_Invoker calcBody(src, hist, &histogramLockInstance);
EqualizeHistLut_Invoker lutBody(src, dst, lut);
cv::BlockedRange heightRange(0, src.rows);
cv::Range heightRange(0, src.rows);
if(EqualizeHistCalcHist_Invoker::isWorthParallel(src))
parallel_for(heightRange, calcBody);
parallel_for_(heightRange, calcBody);
else
calcBody(heightRange);
@@ -3183,7 +3160,7 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
}
if(EqualizeHistLut_Invoker::isWorthParallel(src))
parallel_for(heightRange, lutBody);
parallel_for_(heightRange, lutBody);
else
lutBody(heightRange);
}

View File

@@ -1081,7 +1081,7 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor)
namespace cv
{
class MorphologyRunner
class MorphologyRunner : public ParallelLoopBody
{
public:
MorphologyRunner(Mat _src, Mat _dst, int _nStripes, int _iterations,
@@ -1102,14 +1102,14 @@ public:
columnBorderType = _columnBorderType;
}
void operator () ( const BlockedRange& range ) const
void operator () ( const Range& range ) const
{
int row0 = min(cvRound(range.begin() * src.rows / nStripes), src.rows);
int row1 = min(cvRound(range.end() * src.rows / nStripes), src.rows);
int row0 = min(cvRound(range.start * src.rows / nStripes), src.rows);
int row1 = min(cvRound(range.end * src.rows / nStripes), src.rows);
/*if(0)
printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
src.rows, src.cols, range.begin(), range.end(), row0, row1);*/
src.rows, src.cols, range.start, range.end, row0, row1);*/
Mat srcStripe = src.rowRange(row0, row1);
Mat dstStripe = dst.rowRange(row0, row1);
@@ -1173,15 +1173,15 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
}
int nStripes = 1;
#if defined HAVE_TBB && defined HAVE_TEGRA_OPTIMIZATION
#if defined HAVE_TEGRA_OPTIMIZATION
if (src.data != dst.data && iterations == 1 && //NOTE: threads are not used for inplace processing
(borderType & BORDER_ISOLATED) == 0 && //TODO: check border types
src.rows >= 64 ) //NOTE: just heuristics
nStripes = 4;
#endif
parallel_for(BlockedRange(0, nStripes),
MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
parallel_for_(Range(0, nStripes),
MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
//Ptr<FilterEngine> f = createMorphologyFilter(op, src.type(),
// kernel, anchor, borderType, borderType, borderValue );