Added a "recommended number of stripes" parameter to parallel_for_ and modified some of the functions to use larger stripes (for better performance).

This commit is contained in:
Vadim Pisarevsky
2012-10-11 22:37:14 +04:00
parent 821de96346
commit 354a5f2686
7 changed files with 119 additions and 103 deletions

View File

@@ -187,7 +187,7 @@ private:
template <typename Cvt>
void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
{
parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt));
parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
}
////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////

View File

@@ -357,7 +357,7 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
Range range(0, dsize.height);
resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
parallel_for_(range, invoker);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
@@ -1222,7 +1222,7 @@ static void resizeGeneric_( const Mat& src, Mat& dst,
Range range(0, dsize.height);
resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
ssize, dsize, ksize, xmin, xmax);
parallel_for_(range, invoker);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
template <typename T, typename WT>
@@ -1381,7 +1381,7 @@ static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int
Range range(0, dst.rows);
resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
scale_y, ofs, xofs);
parallel_for_(range, invoker);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
struct DecimateAlpha
@@ -2680,14 +2680,14 @@ typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
const Mat& _fxy, const void* _wtab,
int borderType, const Scalar& _borderValue);
class remapInvoker :
class RemapInvoker :
public ParallelLoopBody
{
public:
remapInvoker(const Mat& _src, Mat _dst, const Mat& _map1, const Mat& _map2, const Mat *_m1,
RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
const Mat *_m2, int _interpolation, int _borderType, const Scalar &_borderValue,
int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
ParallelLoopBody(), src(_src), dst(_dst), map1(_map1), map2(_map2), m1(_m1), m2(_m2),
ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
interpolation(_interpolation), borderType(_borderType), borderValue(_borderValue),
planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
{
@@ -2697,9 +2697,9 @@ public:
{
int x, y, x1, y1;
const int buf_size = 1 << 14;
int brows0 = std::min(128, dst.rows), map_depth = map1.depth();
int bcols0 = std::min(buf_size/brows0, dst.cols);
brows0 = std::min(buf_size/bcols0, dst.rows);
int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
int bcols0 = std::min(buf_size/brows0, dst->cols);
brows0 = std::min(buf_size/bcols0, dst->rows);
#if CV_SSE2
bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
#endif
@@ -2710,17 +2710,17 @@ public:
for( y = range.start; y < range.end; y += brows0 )
{
for( x = 0; x < dst.cols; x += bcols0 )
for( x = 0; x < dst->cols; x += bcols0 )
{
int brows = std::min(brows0, range.end - y);
int bcols = std::min(bcols0, dst.cols - x);
Mat dpart(dst, Rect(x, y, bcols, brows));
int bcols = std::min(bcols0, dst->cols - x);
Mat dpart(*dst, Rect(x, y, bcols, brows));
Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
if( nnfunc )
{
if( map1.type() == CV_16SC2 && !map2.data ) // the data is already in the right format
bufxy = map1(Rect(x, y, bcols, brows));
if( m1->type() == CV_16SC2 && !m2->data ) // the data is already in the right format
bufxy = (*m1)(Rect(x, y, bcols, brows));
else if( map_depth != CV_32F )
{
for( y1 = 0; y1 < brows; y1++ )
@@ -2738,14 +2738,14 @@ public:
}
}
else if( !planar_input )
map1(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
(*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
else
{
for( y1 = 0; y1 < brows; y1++ )
{
short* XY = (short*)(bufxy.data + bufxy.step*y1);
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x;
const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x;
x1 = 0;
#if CV_SSE2
@@ -2778,7 +2778,7 @@ public:
}
}
}
nnfunc( src, dpart, bufxy, borderType, borderValue );
nnfunc( *src, dpart, bufxy, borderType, borderValue );
continue;
}
@@ -2788,16 +2788,15 @@ public:
short* XY = (short*)(bufxy.data + bufxy.step*y1);
ushort* A = (ushort*)(bufa.data + bufa.step*y1);
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
{
bufxy = m1->operator()(Rect(x, y, bcols, brows));
bufa = m2->operator()(Rect(x, y, bcols, brows));
bufxy = (*m1)(Rect(x, y, bcols, brows));
bufa = (*m2)(Rect(x, y, bcols, brows));
}
else if( planar_input )
{
const float* sX = (const float*)(map1.data + map1.step*(y+y1)) + x;
const float* sY = (const float*)(map2.data + map2.step*(y+y1)) + x;
const float* sX = (const float*)(m1->data + m1->step*(y+y1)) + x;
const float* sY = (const float*)(m2->data + m2->step*(y+y1)) + x;
x1 = 0;
#if CV_SSE2
@@ -2850,7 +2849,7 @@ public:
}
else
{
const float* sXY = (const float*)(map1.data + map1.step*(y+y1)) + x*2;
const float* sXY = (const float*)(m1->data + m1->step*(y+y1)) + x*2;
for( x1 = 0; x1 < bcols; x1++ )
{
@@ -2863,15 +2862,14 @@ public:
}
}
}
ifunc(src, dpart, bufxy, bufa, ctab, borderType, borderValue);
ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
}
}
}
private:
Mat src;
Mat dst;
Mat map1, map2;
const Mat* src;
Mat* dst;
const Mat *m1, *m2;
int interpolation, borderType;
Scalar borderValue;
@@ -2961,8 +2959,8 @@ void cv::remap( InputArray _src, OutputArray _dst,
const Mat *m1 = &map1, *m2 = &map2;
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1)) )
if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || !map2.data)) ||
(map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || !map1.data)) )
{
if( map1.type() != CV_16SC2 )
std::swap(m1, m2);
@@ -2974,11 +2972,10 @@ void cv::remap( InputArray _src, OutputArray _dst,
planar_input = map1.channels() == 1;
}
Range range(0, dst.rows);
remapInvoker invoker(src, dst, map1, map2, m1, m2, interpolation,
RemapInvoker invoker(src, dst, m1, m2, interpolation,
borderType, borderValue, planar_input, nnfunc, ifunc,
ctab);
parallel_for_(range, invoker);
parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
}
@@ -3300,7 +3297,7 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
Range range(0, dst.rows);
warpAffineInvoker invoker(src, dst, interpolation, borderType,
borderValue, adelta, bdelta, M);
parallel_for_(range, invoker);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
@@ -3430,7 +3427,7 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
Range range(0, dst.rows);
warpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
parallel_for_(range, invoker);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}

View File

@@ -1919,7 +1919,7 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
}
BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
parallel_for_(Range(0, size.height), body);
parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
}
@@ -2189,7 +2189,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
// parallel_for usage
BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
parallel_for_(Range(0, size.height), body);
parallel_for_(Range(0, size.height), body, dst.total()/(double)(1<<16));
}
}

View File

@@ -664,13 +664,11 @@ getThreshVal_Otsu_8u( const Mat& _src )
class ThresholdRunner : public ParallelLoopBody
{
public:
ThresholdRunner(Mat _src, Mat _dst, int _nStripes, double _thresh, double _maxval, int _thresholdType)
ThresholdRunner(Mat _src, Mat _dst, double _thresh, double _maxval, int _thresholdType)
{
src = _src;
dst = _dst;
nStripes = _nStripes;
thresh = _thresh;
maxval = _maxval;
thresholdType = _thresholdType;
@@ -678,13 +676,8 @@ public:
void operator () ( const Range& range ) const
{
int row0 = std::min(cvRound(range.start * src.rows / nStripes), src.rows);
int row1 = range.end >= nStripes ? src.rows :
std::min(cvRound(range.end * src.rows / nStripes), src.rows);
/*if(0)
printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
src.rows, src.cols, range.begin(), range.end(), row0, row1);*/
int row0 = range.start;
int row1 = range.end;
Mat srcStripe = src.rowRange(row0, row1);
Mat dstStripe = dst.rowRange(row0, row1);
@@ -789,10 +782,9 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
else
CV_Error( CV_StsUnsupportedFormat, "" );
size_t nStripes = (src.total() + (1<<15)) >> 16;
nStripes = MAX(MIN(nStripes, (size_t)4), (size_t)1);
parallel_for_(Range(0, (int)nStripes),
ThresholdRunner(src, dst, nStripes, thresh, maxval, type));
parallel_for_(Range(0, dst.rows),
ThresholdRunner(src, dst, thresh, maxval, type),
dst.total()/(double)(1<<16));
return thresh;
}