improved performance of cv::ocl::minMax
This commit is contained in:
parent
836635d2d5
commit
a5afcd9f11
@ -473,20 +473,13 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
|
|||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template <typename T, typename WT>
|
template <typename T, typename WT>
|
||||||
static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem &dst, int groupnum, string kernelName)
|
static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem &dst, int vlen, int groupnum)
|
||||||
{
|
{
|
||||||
int all_cols = src.step / src.elemSize();
|
|
||||||
int pre_cols = (src.offset % src.step) / src.elemSize();
|
|
||||||
int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
|
|
||||||
int invalid_cols = pre_cols + sec_cols;
|
|
||||||
int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
|
|
||||||
int offset = src.offset / src.elemSize();
|
|
||||||
|
|
||||||
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
|
||||||
const char * const channelMap[] = { " ", " ", "2", "4", "4" };
|
const char * const channelMap[] = { " ", " ", "2", "4", "4", "", "", "", "8" };
|
||||||
|
|
||||||
ostringstream stream;
|
ostringstream stream;
|
||||||
stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()];
|
stream << "-D T=" << typeMap[src.depth()] << channelMap[vlen];
|
||||||
if (numeric_limits<T>::is_integer)
|
if (numeric_limits<T>::is_integer)
|
||||||
{
|
{
|
||||||
stream << " -D MAX_VAL=" << (WT)numeric_limits<T>::max();
|
stream << " -D MAX_VAL=" << (WT)numeric_limits<T>::max();
|
||||||
@ -494,38 +487,38 @@ static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
stream << " -D DEPTH_" << src.depth();
|
stream << " -D DEPTH_" << src.depth();
|
||||||
|
stream << " -D vlen=" << vlen;
|
||||||
std::string buildOptions = stream.str();
|
std::string buildOptions = stream.str();
|
||||||
|
|
||||||
|
int vElemSize = src.elemSize1() * vlen, src_cols = src.cols / vlen;
|
||||||
|
int src_step = src.step / vElemSize, src_offset = src.offset / vElemSize;
|
||||||
|
int mask_step = mask.step / vlen, mask_offset = mask.offset / vlen;
|
||||||
|
int total = src.size().area() / vlen;
|
||||||
|
|
||||||
vector<pair<size_t , const void *> > args;
|
vector<pair<size_t , const void *> > args;
|
||||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
|
||||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step ));
|
||||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_offset ));
|
||||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
|
||||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&src_cols ));
|
||||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&total));
|
||||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
|
||||||
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst));
|
||||||
|
|
||||||
int minvalid_cols = 0, moffset = 0;
|
|
||||||
if (!mask.empty())
|
if (!mask.empty())
|
||||||
{
|
{
|
||||||
int mall_cols = mask.step / mask.elemSize();
|
|
||||||
int mpre_cols = (mask.offset % mask.step) / mask.elemSize();
|
|
||||||
int msec_cols = mall_cols - (mask.offset % mask.step + mask.cols * mask.elemSize() - 1) / mask.elemSize() - 1;
|
|
||||||
minvalid_cols = mpre_cols + msec_cols;
|
|
||||||
moffset = mask.offset / mask.elemSize();
|
|
||||||
|
|
||||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&minvalid_cols ));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask_step ));
|
||||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset ));
|
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask_offset ));
|
||||||
|
|
||||||
kernelName += "_mask";
|
buildOptions += " -D WITH_MASK";
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t globalThreads[3] = {groupnum * 256, 1, 1};
|
size_t globalThreads[3] = { groupnum * 256, 1, 1 };
|
||||||
size_t localThreads[3] = {256, 1, 1};
|
size_t localThreads[3] = { 256, 1, 1 };
|
||||||
|
|
||||||
// kernel use fixed grid size, replace lt on NULL is imposible without kernel changes
|
// kernel use fixed grid size, replace lt on NULL is imposible without kernel changes
|
||||||
openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, globalThreads, localThreads,
|
openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
|
||||||
args, -1, -1, buildOptions.c_str());
|
args, -1, -1, buildOptions.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -535,25 +528,33 @@ void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const
|
|||||||
size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
|
size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
|
||||||
CV_Assert(groupnum != 0);
|
CV_Assert(groupnum != 0);
|
||||||
|
|
||||||
int dbsize = groupnum * 2 * src.elemSize();
|
int vlen = mask.empty() ? 8 : 1, vElemSize = vlen * src.elemSize1();
|
||||||
|
while (src.offset % vElemSize != 0 || src.step % vElemSize != 0 || src.cols % vlen != 0)
|
||||||
|
{
|
||||||
|
vlen >>= 1;
|
||||||
|
vElemSize >>= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int dbsize = groupnum * 2 * vElemSize;
|
||||||
oclMat buf;
|
oclMat buf;
|
||||||
ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
|
ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
|
||||||
|
|
||||||
cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
|
cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
|
||||||
arithmetic_minMax_run<T, WT>(src, mask, buf_data, groupnum, "arithm_op_minMax");
|
arithmetic_minMax_run<T, WT>(src, mask, buf_data, vlen, groupnum);
|
||||||
|
|
||||||
Mat matbuf = Mat(buf);
|
Mat matbuf = Mat(buf);
|
||||||
T *p = matbuf.ptr<T>();
|
T *p = matbuf.ptr<T>();
|
||||||
|
|
||||||
if (minVal != NULL)
|
if (minVal != NULL)
|
||||||
{
|
{
|
||||||
*minVal = std::numeric_limits<double>::max();
|
*minVal = std::numeric_limits<double>::max();
|
||||||
for (int i = 0, end = src.oclchannels() * (int)groupnum; i < end; i++)
|
for (int i = 0, end = vlen * (int)groupnum; i < end; i++)
|
||||||
*minVal = *minVal < p[i] ? *minVal : p[i];
|
*minVal = *minVal < p[i] ? *minVal : p[i];
|
||||||
}
|
}
|
||||||
if (maxVal != NULL)
|
if (maxVal != NULL)
|
||||||
{
|
{
|
||||||
*maxVal = -std::numeric_limits<double>::max();
|
*maxVal = -std::numeric_limits<double>::max();
|
||||||
for (int i = src.oclchannels() * (int)groupnum, end = i << 1; i < end; i++)
|
for (int i = vlen * (int)groupnum, end = i << 1; i < end; i++)
|
||||||
*maxVal = *maxVal > p[i] ? *maxVal : p[i];
|
*maxVal = *maxVal > p[i] ? *maxVal : p[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -564,7 +565,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
|
|||||||
{
|
{
|
||||||
CV_Assert(src.channels() == 1);
|
CV_Assert(src.channels() == 1);
|
||||||
CV_Assert(src.size() == mask.size() || mask.empty());
|
CV_Assert(src.size() == mask.size() || mask.empty());
|
||||||
CV_Assert(src.step % src.elemSize() == 0);
|
CV_Assert(src.step % src.elemSize1() == 0);
|
||||||
|
|
||||||
if (minVal == NULL && maxVal == NULL)
|
if (minVal == NULL && maxVal == NULL)
|
||||||
return;
|
return;
|
||||||
|
@ -63,81 +63,31 @@
|
|||||||
|
|
||||||
/**************************************Array minMax**************************************/
|
/**************************************Array minMax**************************************/
|
||||||
|
|
||||||
__kernel void arithm_op_minMax(__global const T * src, __global T * dst,
|
__kernel void arithm_op_minMax(__global const T * src, int src_step, int src_offset, int src_rows, int src_cols,
|
||||||
int cols, int invalid_cols, int offset, int elemnum, int groupnum)
|
int total, int groupnum, __global T * dst
|
||||||
|
#ifdef WITH_MASK
|
||||||
|
, __global const uchar * mask, int mask_step, int mask_offset
|
||||||
|
#endif
|
||||||
|
)
|
||||||
{
|
{
|
||||||
int lid = get_local_id(0);
|
int lid = get_local_id(0);
|
||||||
int gid = get_group_id(0);
|
int gid = get_group_id(0);
|
||||||
int id = get_global_id(0);
|
int id = get_global_id(0);
|
||||||
|
|
||||||
int idx = offset + id + (id / cols) * invalid_cols;
|
|
||||||
|
|
||||||
__local T localmem_max[128], localmem_min[128];
|
__local T localmem_max[128], localmem_min[128];
|
||||||
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
|
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
|
||||||
|
int y, x;
|
||||||
|
|
||||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
for (int grainSize = groupnum << 8; id < total; id += grainSize)
|
||||||
{
|
{
|
||||||
idx = offset + id + (id / cols) * invalid_cols;
|
y = id / src_cols;
|
||||||
temp = src[idx];
|
x = id % src_cols;
|
||||||
minval = min(minval, temp);
|
|
||||||
maxval = max(maxval, temp);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lid > 127)
|
#ifdef WITH_MASK
|
||||||
{
|
if (mask[mad24(y, mask_step, x + mask_offset)])
|
||||||
localmem_min[lid - 128] = minval;
|
#endif
|
||||||
localmem_max[lid - 128] = maxval;
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
if (lid < 128)
|
|
||||||
{
|
|
||||||
localmem_min[lid] = min(minval, localmem_min[lid]);
|
|
||||||
localmem_max[lid] = max(maxval, localmem_max[lid]);
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
|
|
||||||
for (int lsize = 64; lsize > 0; lsize >>= 1)
|
|
||||||
{
|
|
||||||
if (lid < lsize)
|
|
||||||
{
|
{
|
||||||
int lid2 = lsize + lid;
|
temp = src[mad24(y, src_step, x + src_offset)];
|
||||||
localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
|
|
||||||
localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
|
|
||||||
}
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lid == 0)
|
|
||||||
{
|
|
||||||
dst[gid] = localmem_min[0];
|
|
||||||
dst[gid + groupnum] = localmem_max[0];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__kernel void arithm_op_minMax_mask(__global const T * src, __global T * dst,
|
|
||||||
int cols, int invalid_cols, int offset,
|
|
||||||
int elemnum, int groupnum,
|
|
||||||
const __global uchar * mask, int minvalid_cols, int moffset)
|
|
||||||
{
|
|
||||||
int lid = get_local_id(0);
|
|
||||||
int gid = get_group_id(0);
|
|
||||||
int id = get_global_id(0);
|
|
||||||
|
|
||||||
int idx = offset + id + (id / cols) * invalid_cols;
|
|
||||||
int midx = moffset + id + (id / cols) * minvalid_cols;
|
|
||||||
|
|
||||||
__local T localmem_max[128], localmem_min[128];
|
|
||||||
T minval = (T)(MAX_VAL), maxval = (T)(MIN_VAL), temp;
|
|
||||||
|
|
||||||
for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
|
|
||||||
{
|
|
||||||
idx = offset + id + (id / cols) * invalid_cols;
|
|
||||||
midx = moffset + id + (id / cols) * minvalid_cols;
|
|
||||||
|
|
||||||
if (mask[midx])
|
|
||||||
{
|
|
||||||
temp = src[idx];
|
|
||||||
minval = min(minval, temp);
|
minval = min(minval, temp);
|
||||||
maxval = max(maxval, temp);
|
maxval = max(maxval, temp);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user