optimization of cv::accumulate**
This commit is contained in:
parent
4b84d5d2c1
commit
a350b76738
@ -369,11 +369,17 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
|
|||||||
CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
|
CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
|
||||||
op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);
|
op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);
|
||||||
|
|
||||||
int stype = _src.type(), cn = CV_MAT_CN(stype);
|
const ocl::Device & dev = ocl::Device::getDefault();
|
||||||
int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth();
|
int vectorWidths[] = { 4, 4, 2, 2, 1, 1, 1, -1 };
|
||||||
|
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth();
|
||||||
|
int pcn = std::max(vectorWidths[sdepth], vectorWidths[ddepth]), sesz = CV_ELEM_SIZE(sdepth) * pcn,
|
||||||
|
desz = CV_ELEM_SIZE(ddepth) * pcn, rowsPerWI = dev.isIntel() ? 4 : 1;
|
||||||
|
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
|
bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
|
||||||
haveMask = !_mask.empty();
|
usepcn = _src.offset() % sesz == 0 && _src.step() % sesz == 0 && (_src.cols() * cn) % pcn == 0 &&
|
||||||
|
_src2.offset() % desz == 0 && _src2.step() % desz == 0 &&
|
||||||
|
_dst.offset() % pcn == 0 && _dst.step() % desz == 0 && !haveMask;
|
||||||
|
int kercn = usepcn ? pcn : haveMask ? cn : 1;
|
||||||
|
|
||||||
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|
||||||
return false;
|
return false;
|
||||||
@ -381,11 +387,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
|
|||||||
const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
|
const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
|
||||||
"ACCUMULATE_WEIGHTED" };
|
"ACCUMULATE_WEIGHTED" };
|
||||||
|
|
||||||
|
char cvt[40];
|
||||||
ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
|
ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
|
||||||
format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s",
|
format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
|
||||||
opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
|
opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
|
||||||
ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth),
|
ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth),
|
||||||
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI,
|
||||||
|
ocl::convertTypeStr(sdepth, ddepth, 1, cvt)));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -393,7 +401,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
|
|||||||
|
|
||||||
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
|
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
|
||||||
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
|
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
|
||||||
dstarg = ocl::KernelArg::ReadWrite(dst),
|
dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn),
|
||||||
maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
|
maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
|
||||||
|
|
||||||
int argidx = k.set(0, srcarg);
|
int argidx = k.set(0, srcarg);
|
||||||
@ -410,7 +418,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
|
|||||||
if (haveMask)
|
if (haveMask)
|
||||||
k.set(argidx, maskarg);
|
k.set(argidx, maskarg);
|
||||||
|
|
||||||
size_t globalsize[2] = { src.cols, src.rows };
|
size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
|
||||||
return k.run(2, globalsize, NULL, false);
|
return k.run(2, globalsize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,13 +13,18 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define SRC_TSIZE cn * (int)sizeof(srcT1)
|
||||||
|
#define DST_TSIZE cn * (int)sizeof(dstT1)
|
||||||
|
|
||||||
|
#define noconvert
|
||||||
|
|
||||||
__kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,
|
__kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,
|
||||||
#ifdef ACCUMULATE_PRODUCT
|
#ifdef ACCUMULATE_PRODUCT
|
||||||
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
__global const uchar * src2ptr, int src2_step, int src2_offset,
|
||||||
#endif
|
#endif
|
||||||
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
|
__global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
|
||||||
#ifdef ACCUMULATE_WEIGHTED
|
#ifdef ACCUMULATE_WEIGHTED
|
||||||
, dstT alpha
|
, dstT1 alpha
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAVE_MASK
|
#ifdef HAVE_MASK
|
||||||
, __global const uchar * mask, int mask_step, int mask_offset
|
, __global const uchar * mask, int mask_step, int mask_offset
|
||||||
@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of
|
|||||||
)
|
)
|
||||||
{
|
{
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
int y = get_global_id(1);
|
int y = get_global_id(1) * rowsPerWI;
|
||||||
|
|
||||||
if (x < dst_cols && y < dst_rows)
|
if (x < dst_cols)
|
||||||
{
|
{
|
||||||
int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT));
|
int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset));
|
||||||
#ifdef HAVE_MASK
|
#ifdef HAVE_MASK
|
||||||
int mask_index = mad24(y, mask_step, mask_offset + x);
|
int mask_index = mad24(y, mask_step, mask_offset + x);
|
||||||
mask += mask_index;
|
mask += mask_index;
|
||||||
#endif
|
#endif
|
||||||
int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT));
|
|
||||||
|
|
||||||
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
|
|
||||||
#ifdef ACCUMULATE_PRODUCT
|
#ifdef ACCUMULATE_PRODUCT
|
||||||
int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT));
|
int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset));
|
||||||
__global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
|
|
||||||
#endif
|
#endif
|
||||||
__global dstT * dst = (__global dstT *)(dstptr + dst_index);
|
int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset));
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int c = 0; c < cn; ++c)
|
for (int i = 0; i < rowsPerWI; ++i)
|
||||||
|
if (y < dst_rows)
|
||||||
|
{
|
||||||
|
__global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index);
|
||||||
|
#ifdef ACCUMULATE_PRODUCT
|
||||||
|
__global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
|
||||||
|
#endif
|
||||||
|
__global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index);
|
||||||
|
|
||||||
#ifdef HAVE_MASK
|
#ifdef HAVE_MASK
|
||||||
if (mask[0])
|
if (mask[0])
|
||||||
#endif
|
#endif
|
||||||
|
#pragma unroll
|
||||||
|
for (int c = 0; c < cn; ++c)
|
||||||
|
{
|
||||||
#ifdef ACCUMULATE
|
#ifdef ACCUMULATE
|
||||||
dst[c] += src[c];
|
dst[c] += convertToDT(src[c]);
|
||||||
#elif defined ACCUMULATE_SQUARE
|
#elif defined ACCUMULATE_SQUARE
|
||||||
dst[c] += src[c] * src[c];
|
dstT1 val = convertToDT(src[c]);
|
||||||
|
dst[c] = fma(val, val, dst[c]);
|
||||||
#elif defined ACCUMULATE_PRODUCT
|
#elif defined ACCUMULATE_PRODUCT
|
||||||
dst[c] += src[c] * src2[c];
|
dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]);
|
||||||
#elif defined ACCUMULATE_WEIGHTED
|
#elif defined ACCUMULATE_WEIGHTED
|
||||||
dst[c] = (1 - alpha) * dst[c] + src[c] * alpha;
|
dst[c] = fma(1 - alpha, dst[c], src[c] * alpha);
|
||||||
#else
|
#else
|
||||||
#error "Unknown accumulation type"
|
#error "Unknown accumulation type"
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
src_index += src_step;
|
||||||
|
#ifdef ACCUMULATE_PRODUCT
|
||||||
|
src2_index += src2_step;
|
||||||
|
#endif
|
||||||
|
#ifdef HAVE_MASK
|
||||||
|
mask += mask_step;
|
||||||
|
#endif
|
||||||
|
dst_index += dst_step;
|
||||||
|
++y;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user