Merge pull request #2956 from ilya-lavrenov:tapi_accumulate
Commit: b332152bef
modules/core/include/opencv2/core/ocl.hpp
@@ -598,7 +598,27 @@ CV_EXPORTS const char* typeToStr(int t);
 CV_EXPORTS const char* memopTypeToStr(int t);
 CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
 CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
+
+enum OclVectorStrategy
+{
+    // all matrices have its own vector width
+    OCL_VECTOR_OWN = 0,
+    // all matrices have maximal vector width among all matrices
+    // (useful for cases when matrices have different data types)
+    OCL_VECTOR_MAX = 1,
+
+    // default strategy
+    OCL_VECTOR_DEFAULT = OCL_VECTOR_OWN
+};
+
 CV_EXPORTS int predictOptimalVectorWidth(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
                                          InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
-                                         InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray());
+                                         InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray(),
+                                         OclVectorStrategy strat = OCL_VECTOR_DEFAULT);
+
+// with OCL_VECTOR_MAX strategy
+CV_EXPORTS int predictOptimalVectorWidthMax(InputArray src1, InputArray src2 = noArray(), InputArray src3 = noArray(),
+                                            InputArray src4 = noArray(), InputArray src5 = noArray(), InputArray src6 = noArray(),
+                                            InputArray src7 = noArray(), InputArray src8 = noArray(), InputArray src9 = noArray());
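For readers skimming the header change: these helpers are meant to be called from an OpenCL code path before the kernel is built, to pick a vector width (kercn) that all operands can share. A minimal usage sketch follows; the function name ocl_someElementwiseOp and everything around the cv::ocl calls are hypothetical, only the API itself comes from this patch.

#include <opencv2/core/ocl.hpp>

// Hypothetical caller (not part of the patch) showing how the new helper is meant to be used.
static bool ocl_someElementwiseOp(cv::InputArray src, cv::InputOutputArray dst)
{
    // Pick one vector width that every operand can share (OCL_VECTOR_MAX strategy).
    int kercn = cv::ocl::predictOptimalVectorWidthMax(src, dst);

    // kercn would then feed the program build options (e.g. "-D cn=4") and the
    // global size, e.g. globalsize[0] = src.cols() * src.channels() / kercn.
    return kercn >= 1;
}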
modules/core/src/ocl.cpp
@@ -4451,42 +4451,46 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
         if (!src.empty()) \
         { \
             CV_Assert(src.isMat() || src.isUMat()); \
-            int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
             Size csize = src.size(); \
-            cols.push_back(ccn * csize.width); \
-            if (ctype != type) \
+            int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
+                ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
+            if (cwidth < ckercn || ckercn <= 0) \
+                return 1; \
+            cols.push_back(cwidth); \
+            if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
                 return 1; \
             offsets.push_back(src.offset()); \
             steps.push_back(src.step()); \
+            dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
+            kercns.push_back(ckercn); \
         } \
     } \
     while ((void)0, 0)

 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
                               InputArray src4, InputArray src5, InputArray src6,
-                              InputArray src7, InputArray src8, InputArray src9)
+                              InputArray src7, InputArray src8, InputArray src9,
+                              OclVectorStrategy strat)
 {
-    int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz1 = CV_ELEM_SIZE1(depth);
-    Size ssize = src1.size();
     const ocl::Device & d = ocl::Device::getDefault();

+    int ref_type = src1.type();
+
     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
-        d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth];
+        d.preferredVectorWidthDouble(), -1 };

     // if the device says don't use vectors
     if (vectorWidths[0] == 1)
     {
         // it's heuristic
-        int vectorWidthsOthers[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
-        kercn = vectorWidthsOthers[depth];
+        vectorWidths[CV_8U] = vectorWidths[CV_8S] = 16;
+        vectorWidths[CV_16U] = vectorWidths[CV_16S] = 8;
+        vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
     }

-    if (ssize.width * cn < kercn || kercn <= 0)
-        return 1;
-
     std::vector<size_t> offsets, steps, cols;
+    std::vector<int> dividers, kercns;
     PROCESS_SRC(src1);
     PROCESS_SRC(src2);
     PROCESS_SRC(src3);
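A note on the vectorWidths table above: it is indexed by the OpenCV depth constants (CV_8U = 0 through CV_64F = 6), which is why the heuristic branch can overwrite per-depth defaults directly. Spelled out, the mapping is roughly the following; the comment on index 7 is an interpretation, not text from the patch.

// vectorWidths[CV_8U]  == vectorWidths[CV_8S]  -> Device::preferredVectorWidthChar()
// vectorWidths[CV_16U] == vectorWidths[CV_16S] -> Device::preferredVectorWidthShort()
// vectorWidths[CV_32S]                         -> Device::preferredVectorWidthInt()
// vectorWidths[CV_32F]                         -> Device::preferredVectorWidthFloat()
// vectorWidths[CV_64F]                         -> Device::preferredVectorWidthDouble()
// vectorWidths[7]      == -1                      (sentinel; no depth maps to it here)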
@@ -4498,27 +4502,24 @@ int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
     PROCESS_SRC(src9);

     size_t size = offsets.size();
-    int wsz = kercn * esz1;
-    std::vector<int> dividers(size, wsz);

     for (size_t i = 0; i < size; ++i)
-        while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
-            dividers[i] >>= 1;
+        while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
+            dividers[i] >>= 1, kercns[i] >>= 1;

     // default strategy
-    for (size_t i = 0; i < size; ++i)
-        if (dividers[i] != wsz)
-        {
-            kercn = 1;
-            break;
-        }
-
-    // another strategy
-    // width = *std::min_element(dividers.begin(), dividers.end());
+    int kercn = *std::min_element(kercns.begin(), kercns.end());

     return kercn;
 }

+int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
+                                 InputArray src4, InputArray src5, InputArray src6,
+                                 InputArray src7, InputArray src8, InputArray src9)
+{
+    return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
+}
+
 #undef PROCESS_SRC
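To see how the alignment loop settles on a width, here is a self-contained sketch of the same halving logic with made-up numbers (a row step of 1002 bytes on 8-bit data); only the loop shape is taken from the patch, the values are illustrative.

#include <cstddef>
#include <cstdio>

// Standalone sketch of the divider/kercn halving from predictOptimalVectorWidth().
int main()
{
    std::size_t offset = 0, step = 1002, cols = 1000; // cols counted as width * channels
    int divider = 4 * 1; // ckercn * CV_ELEM_SIZE1 for CV_8U data
    int kercn   = 4;

    // Halve until offset and step (bytes) are divisible by the divider
    // and cols (elements) is divisible by the vector width.
    while (offset % divider != 0 || step % divider != 0 || cols % kercn != 0)
    {
        divider >>= 1;
        kercn   >>= 1;
    }

    std::printf("kercn = %d\n", kercn); // prints 2: a 1002-byte step is not a multiple of 4
    return 0;
}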
modules/imgproc/src/accumulate.cpp
@@ -369,11 +369,10 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
     CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
               op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);

-    int stype = _src.type(), cn = CV_MAT_CN(stype);
-    int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth();
-
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
-            haveMask = !_mask.empty();
+    const ocl::Device & dev = ocl::Device::getDefault();
+    bool haveMask = !_mask.empty(), doubleSupport = dev.doubleFPConfig() > 0;
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth();
+    int kercn = haveMask ? cn : ocl::predictOptimalVectorWidthMax(_src, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;

     if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
         return false;
@@ -381,11 +380,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
     const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
                                     "ACCUMULATE_WEIGHTED" };

+    char cvt[40];
     ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
-                  format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s",
+                  format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
                          opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
-                         ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI,
+                         ocl::convertTypeStr(sdepth, ddepth, 1, cvt)));
     if (k.empty())
         return false;
@@ -393,7 +394,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray

     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
             src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
-            dstarg = ocl::KernelArg::ReadWrite(dst),
+            dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn),
             maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);

     int argidx = k.set(0, srcarg);
@@ -410,7 +411,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
     if (haveMask)
         k.set(argidx, maskarg);

-    size_t globalsize[2] = { src.cols, src.rows };
+    size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
     return k.run(2, globalsize, NULL, false);
 }
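A hedged worked example of the new launch geometry (the sizes, kercn and rowsPerWI values below are assumed, not taken from the patch): for a 640x480 CV_8UC1 source with no mask, a device preferring char4 would give kercn = 4 and an Intel device rowsPerWI = 4, so the grid shrinks accordingly.

#include <cstddef>

// Illustrative launch-size arithmetic mirroring the patched ocl_accumulate().
int main()
{
    int cols = 640, rows = 480, cn = 1, kercn = 4, rowsPerWI = 4; // assumed values
    std::size_t globalsize[2] = { (std::size_t)(cols * cn / kercn),                   // 160 work-items across
                                  (std::size_t)((rows + rowsPerWI - 1) / rowsPerWI) }; // 120 work-items down
    // Each work-item now handles kercn scalar elements of rowsPerWI rows,
    // where the old kernel handled the cn channels of a single pixel.
    (void)globalsize;
    return 0;
}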
modules/imgproc/src/opencl/accumulate.cl
@@ -13,13 +13,18 @@
 #endif
 #endif

+#define SRC_TSIZE cn * (int)sizeof(srcT1)
+#define DST_TSIZE cn * (int)sizeof(dstT1)
+
+#define noconvert
+
 __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,
 #ifdef ACCUMULATE_PRODUCT
                          __global const uchar * src2ptr, int src2_step, int src2_offset,
 #endif
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
 #ifdef ACCUMULATE_WEIGHTED
-                         , dstT alpha
+                         , dstT1 alpha
 #endif
 #ifdef HAVE_MASK
                          , __global const uchar * mask, int mask_step, int mask_offset
@@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of
                          )
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * rowsPerWI;

-    if (x < dst_cols && y < dst_rows)
+    if (x < dst_cols)
     {
-        int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT));
+        int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset));
 #ifdef HAVE_MASK
         int mask_index = mad24(y, mask_step, mask_offset + x);
         mask += mask_index;
 #endif
-        int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT));
-
-        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
 #ifdef ACCUMULATE_PRODUCT
-        int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT));
-        __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
+        int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset));
 #endif
-        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+        int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset));

         #pragma unroll
-        for (int c = 0; c < cn; ++c)
+        for (int i = 0; i < rowsPerWI; ++i)
+            if (y < dst_rows)
+            {
+                __global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index);
+#ifdef ACCUMULATE_PRODUCT
+                __global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
+#endif
+                __global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index);
+
 #ifdef HAVE_MASK
-            if (mask[0])
+                if (mask[0])
 #endif
+                    #pragma unroll
+                    for (int c = 0; c < cn; ++c)
+                    {
 #ifdef ACCUMULATE
-            dst[c] += src[c];
+                        dst[c] += convertToDT(src[c]);
 #elif defined ACCUMULATE_SQUARE
-            dst[c] += src[c] * src[c];
+                        dstT1 val = convertToDT(src[c]);
+                        dst[c] = fma(val, val, dst[c]);
 #elif defined ACCUMULATE_PRODUCT
-            dst[c] += src[c] * src2[c];
+                        dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]);
 #elif defined ACCUMULATE_WEIGHTED
-            dst[c] = (1 - alpha) * dst[c] + src[c] * alpha;
+                        dst[c] = fma(1 - alpha, dst[c], src[c] * alpha);
 #else
 #error "Unknown accumulation type"
 #endif
+                    }
+
+                src_index += src_step;
+#ifdef ACCUMULATE_PRODUCT
+                src2_index += src2_step;
+#endif
+#ifdef HAVE_MASK
+                mask += mask_step;
+#endif
+                dst_index += dst_step;
+                ++y;
+            }
     }
 }
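For orientation, a hedged sketch of the build options the host code above would hand to this kernel; the CV_8U source / CV_32F destination, kercn = 4 and rowsPerWI = 4 are assumed values, not taken from the patch. When source and destination depths match, convertTypeStr() yields "noconvert", which the new "#define noconvert" line turns into a no-op so convertToDT(src[c]) reads the value unchanged.

#include <opencv2/core.hpp>
#include <opencv2/core/ocl.hpp>
#include <iostream>

// Prints roughly the options the patched ocl_accumulate() would pass for a
// hypothetical CV_8U -> CV_32F accumulate with kercn = 4, rowsPerWI = 4, no mask.
int main()
{
    int sdepth = CV_8U, ddepth = CV_32F, kercn = 4, rowsPerWI = 4;
    char cvt[40];
    cv::String opts = cv::format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
                                 "ACCUMULATE", "",
                                 cv::ocl::typeToStr(sdepth), kercn, cv::ocl::typeToStr(ddepth),
                                 "", rowsPerWI,
                                 cv::ocl::convertTypeStr(sdepth, ddepth, 1, cvt));
    // Expected (roughly): -D ACCUMULATE -D srcT1=uchar -D cn=4 -D dstT1=float -D rowsPerWI=4 -D convertToDT=convert_float
    std::cout << opts << std::endl;
    return 0;
}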