Merge pull request #2801 from ilya-lavrenov:tapi_reduction
This commit is contained in:
commit
f30301d171
@ -82,10 +82,24 @@
|
|||||||
|
|
||||||
#define noconvert
|
#define noconvert
|
||||||
|
|
||||||
|
#ifndef kercn
|
||||||
|
#define kercn 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAVE_MASK_CONT
|
||||||
|
#define MASK_INDEX int mask_index = id + mask_offset;
|
||||||
|
#else
|
||||||
|
#define MASK_INDEX int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols))
|
||||||
|
#endif
|
||||||
|
|
||||||
#if cn != 3
|
#if cn != 3
|
||||||
#define loadpix(addr) *(__global const srcT *)(addr)
|
#define loadpix(addr) *(__global const srcT *)(addr)
|
||||||
#define storepix(val, addr) *(__global dstT *)(addr) = val
|
#define storepix(val, addr) *(__global dstT *)(addr) = val
|
||||||
|
#if kercn == 1
|
||||||
#define srcTSIZE (int)sizeof(srcT)
|
#define srcTSIZE (int)sizeof(srcT)
|
||||||
|
#else
|
||||||
|
#define srcTSIZE (int)sizeof(srcT1)
|
||||||
|
#endif
|
||||||
#define dstTSIZE (int)sizeof(dstT)
|
#define dstTSIZE (int)sizeof(dstT)
|
||||||
#else
|
#else
|
||||||
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
|
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
|
||||||
@ -130,21 +144,121 @@
|
|||||||
|
|
||||||
#ifdef HAVE_MASK
|
#ifdef HAVE_MASK
|
||||||
#define REDUCE_GLOBAL \
|
#define REDUCE_GLOBAL \
|
||||||
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
|
MASK_INDEX; \
|
||||||
if (mask[mask_index]) \
|
if (mask[mask_index]) \
|
||||||
{ \
|
{ \
|
||||||
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
|
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
|
||||||
FUNC(accumulator, temp); \
|
FUNC(accumulator, temp); \
|
||||||
}
|
}
|
||||||
#elif defined OP_DOT
|
#elif defined OP_DOT
|
||||||
#define REDUCE_GLOBAL \
|
|
||||||
int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \
|
#ifdef HAVE_SRC2_CONT
|
||||||
dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
#define SRC2_INDEX int src2_index = mad24(id, srcTSIZE, src2_offset);
|
||||||
FUNC(accumulator, temp, temp2)
|
|
||||||
#else
|
#else
|
||||||
|
#define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if kercn == 1
|
||||||
#define REDUCE_GLOBAL \
|
#define REDUCE_GLOBAL \
|
||||||
dstT temp = convertToDT(loadpix(srcptr + src_index)); \
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp, temp2)
|
||||||
|
#elif kercn == 2
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1)
|
||||||
|
#elif kercn == 4
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1); \
|
||||||
|
FUNC(accumulator, temp.s2, temp2.s2); \
|
||||||
|
FUNC(accumulator, temp.s3, temp2.s3)
|
||||||
|
#elif kercn == 8
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1); \
|
||||||
|
FUNC(accumulator, temp.s2, temp2.s2); \
|
||||||
|
FUNC(accumulator, temp.s3, temp2.s3); \
|
||||||
|
FUNC(accumulator, temp.s4, temp2.s4); \
|
||||||
|
FUNC(accumulator, temp.s5, temp2.s5); \
|
||||||
|
FUNC(accumulator, temp.s6, temp2.s6); \
|
||||||
|
FUNC(accumulator, temp.s7, temp2.s7)
|
||||||
|
#elif kercn == 16
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1); \
|
||||||
|
FUNC(accumulator, temp.s2, temp2.s2); \
|
||||||
|
FUNC(accumulator, temp.s3, temp2.s3); \
|
||||||
|
FUNC(accumulator, temp.s4, temp2.s4); \
|
||||||
|
FUNC(accumulator, temp.s5, temp2.s5); \
|
||||||
|
FUNC(accumulator, temp.s6, temp2.s6); \
|
||||||
|
FUNC(accumulator, temp.s7, temp2.s7); \
|
||||||
|
FUNC(accumulator, temp.s8, temp2.s8); \
|
||||||
|
FUNC(accumulator, temp.s9, temp2.s9); \
|
||||||
|
FUNC(accumulator, temp.sA, temp2.sA); \
|
||||||
|
FUNC(accumulator, temp.sB, temp2.sB); \
|
||||||
|
FUNC(accumulator, temp.sC, temp2.sC); \
|
||||||
|
FUNC(accumulator, temp.sD, temp2.sD); \
|
||||||
|
FUNC(accumulator, temp.sE, temp2.sE); \
|
||||||
|
FUNC(accumulator, temp.sF, temp2.sF)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
#if kercn == 1
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
|
||||||
FUNC(accumulator, temp)
|
FUNC(accumulator, temp)
|
||||||
|
#elif kercn == 2
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
|
||||||
|
FUNC(accumulator, temp.s0); \
|
||||||
|
FUNC(accumulator, temp.s1)
|
||||||
|
#elif kercn == 4
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
|
||||||
|
FUNC(accumulator, temp.s0); \
|
||||||
|
FUNC(accumulator, temp.s1); \
|
||||||
|
FUNC(accumulator, temp.s2); \
|
||||||
|
FUNC(accumulator, temp.s3)
|
||||||
|
#elif kercn == 8
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
|
||||||
|
FUNC(accumulator, temp.s0); \
|
||||||
|
FUNC(accumulator, temp.s1); \
|
||||||
|
FUNC(accumulator, temp.s2); \
|
||||||
|
FUNC(accumulator, temp.s3); \
|
||||||
|
FUNC(accumulator, temp.s4); \
|
||||||
|
FUNC(accumulator, temp.s5); \
|
||||||
|
FUNC(accumulator, temp.s6); \
|
||||||
|
FUNC(accumulator, temp.s7)
|
||||||
|
#elif kercn == 16
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)); \
|
||||||
|
FUNC(accumulator, temp.s0); \
|
||||||
|
FUNC(accumulator, temp.s1); \
|
||||||
|
FUNC(accumulator, temp.s2); \
|
||||||
|
FUNC(accumulator, temp.s3); \
|
||||||
|
FUNC(accumulator, temp.s4); \
|
||||||
|
FUNC(accumulator, temp.s5); \
|
||||||
|
FUNC(accumulator, temp.s6); \
|
||||||
|
FUNC(accumulator, temp.s7); \
|
||||||
|
FUNC(accumulator, temp.s8); \
|
||||||
|
FUNC(accumulator, temp.s9); \
|
||||||
|
FUNC(accumulator, temp.sA); \
|
||||||
|
FUNC(accumulator, temp.sB); \
|
||||||
|
FUNC(accumulator, temp.sC); \
|
||||||
|
FUNC(accumulator, temp.sD); \
|
||||||
|
FUNC(accumulator, temp.sE); \
|
||||||
|
FUNC(accumulator, temp.sF)
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define SET_LOCAL_1 \
|
#define SET_LOCAL_1 \
|
||||||
@ -163,9 +277,54 @@
|
|||||||
__local dstT localmem[WGS2_ALIGNED]
|
__local dstT localmem[WGS2_ALIGNED]
|
||||||
#define DEFINE_ACCUMULATOR \
|
#define DEFINE_ACCUMULATOR \
|
||||||
dstT accumulator = (dstT)(0); \
|
dstT accumulator = (dstT)(0); \
|
||||||
srcT zero = (srcT)(0), one = (srcT)(1)
|
srcT1 zero = (srcT1)(0), one = (srcT1)(1)
|
||||||
|
#if kercn == 1
|
||||||
#define REDUCE_GLOBAL \
|
#define REDUCE_GLOBAL \
|
||||||
accumulator += loadpix(srcptr + src_index) == zero ? zero : one
|
accumulator += loadpix(srcptr + src_index) == zero ? zero : one
|
||||||
|
#elif kercn == 2
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
srcT value = loadpix(srcptr + src_index); \
|
||||||
|
accumulator += value.s0 == zero ? zero : one; \
|
||||||
|
accumulator += value.s1 == zero ? zero : one
|
||||||
|
#elif kercn == 4
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
srcT value = loadpix(srcptr + src_index); \
|
||||||
|
accumulator += value.s0 == zero ? zero : one; \
|
||||||
|
accumulator += value.s1 == zero ? zero : one; \
|
||||||
|
accumulator += value.s2 == zero ? zero : one; \
|
||||||
|
accumulator += value.s3 == zero ? zero : one
|
||||||
|
#elif kercn == 8
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
srcT value = loadpix(srcptr + src_index); \
|
||||||
|
accumulator += value.s0 == zero ? zero : one; \
|
||||||
|
accumulator += value.s1 == zero ? zero : one; \
|
||||||
|
accumulator += value.s2 == zero ? zero : one; \
|
||||||
|
accumulator += value.s3 == zero ? zero : one; \
|
||||||
|
accumulator += value.s4 == zero ? zero : one; \
|
||||||
|
accumulator += value.s5 == zero ? zero : one; \
|
||||||
|
accumulator += value.s6 == zero ? zero : one; \
|
||||||
|
accumulator += value.s7 == zero ? zero : one
|
||||||
|
#elif kercn == 16
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
srcT value = loadpix(srcptr + src_index); \
|
||||||
|
accumulator += value.s0 == zero ? zero : one; \
|
||||||
|
accumulator += value.s1 == zero ? zero : one; \
|
||||||
|
accumulator += value.s2 == zero ? zero : one; \
|
||||||
|
accumulator += value.s3 == zero ? zero : one; \
|
||||||
|
accumulator += value.s4 == zero ? zero : one; \
|
||||||
|
accumulator += value.s5 == zero ? zero : one; \
|
||||||
|
accumulator += value.s6 == zero ? zero : one; \
|
||||||
|
accumulator += value.s7 == zero ? zero : one; \
|
||||||
|
accumulator += value.s8 == zero ? zero : one; \
|
||||||
|
accumulator += value.s9 == zero ? zero : one; \
|
||||||
|
accumulator += value.sA == zero ? zero : one; \
|
||||||
|
accumulator += value.sB == zero ? zero : one; \
|
||||||
|
accumulator += value.sC == zero ? zero : one; \
|
||||||
|
accumulator += value.sD == zero ? zero : one; \
|
||||||
|
accumulator += value.sE == zero ? zero : one; \
|
||||||
|
accumulator += value.sF == zero ? zero : one
|
||||||
|
#endif
|
||||||
|
|
||||||
#define SET_LOCAL_1 \
|
#define SET_LOCAL_1 \
|
||||||
localmem[lid] = accumulator
|
localmem[lid] = accumulator
|
||||||
#define REDUCE_LOCAL_1 \
|
#define REDUCE_LOCAL_1 \
|
||||||
@ -183,7 +342,7 @@
|
|||||||
#define DEFINE_ACCUMULATOR \
|
#define DEFINE_ACCUMULATOR \
|
||||||
srcT maxval = MIN_VAL, temp
|
srcT maxval = MIN_VAL, temp
|
||||||
#define REDUCE_GLOBAL \
|
#define REDUCE_GLOBAL \
|
||||||
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
|
MASK_INDEX; \
|
||||||
if (mask[mask_index]) \
|
if (mask[mask_index]) \
|
||||||
{ \
|
{ \
|
||||||
temp = loadpix(srcptr + src_index); \
|
temp = loadpix(srcptr + src_index); \
|
||||||
@ -270,7 +429,7 @@
|
|||||||
#define REDUCE_GLOBAL \
|
#define REDUCE_GLOBAL \
|
||||||
temp = loadpix(srcptr + src_index); \
|
temp = loadpix(srcptr + src_index); \
|
||||||
temploc = id; \
|
temploc = id; \
|
||||||
int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \
|
MASK_INDEX; \
|
||||||
__global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
|
__global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
|
||||||
temp_mask = mask[0]; \
|
temp_mask = mask[0]; \
|
||||||
srcT temp_minval = minval, temp_maxval = maxval; \
|
srcT temp_minval = minval, temp_maxval = maxval; \
|
||||||
@ -303,14 +462,20 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
|
|||||||
{
|
{
|
||||||
int lid = get_local_id(0);
|
int lid = get_local_id(0);
|
||||||
int gid = get_group_id(0);
|
int gid = get_group_id(0);
|
||||||
int id = get_global_id(0);
|
int id = get_global_id(0) * kercn;
|
||||||
|
|
||||||
|
srcptr += src_offset;
|
||||||
|
|
||||||
DECLARE_LOCAL_MEM;
|
DECLARE_LOCAL_MEM;
|
||||||
DEFINE_ACCUMULATOR;
|
DEFINE_ACCUMULATOR;
|
||||||
|
|
||||||
for (int grain = groupnum * WGS; id < total; id += grain)
|
for (int grain = groupnum * WGS * kercn; id < total; id += grain)
|
||||||
{
|
{
|
||||||
int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset));
|
#ifdef HAVE_SRC_CONT
|
||||||
|
int src_index = mul24(id, srcTSIZE);
|
||||||
|
#else
|
||||||
|
int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE));
|
||||||
|
#endif
|
||||||
REDUCE_GLOBAL;
|
REDUCE_GLOBAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -473,8 +473,11 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
|
|||||||
{
|
{
|
||||||
CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
|
CV_Assert(sum_op == OCL_OP_SUM || sum_op == OCL_OP_SUM_ABS || sum_op == OCL_OP_SUM_SQR);
|
||||||
|
|
||||||
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
haveMask = _mask.kind() != _InputArray::NONE;
|
||||||
|
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
|
||||||
|
kercn = cn == 1 && !haveMask ? ocl::predictOptimalVectorWidth(_src) : 1,
|
||||||
|
mcn = std::max(cn, kercn);
|
||||||
|
|
||||||
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
|
if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
|
||||||
return false;
|
return false;
|
||||||
@ -484,7 +487,6 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
|
|||||||
|
|
||||||
int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
|
int ddepth = std::max(sum_op == OCL_OP_SUM_SQR ? CV_32F : CV_32S, depth),
|
||||||
dtype = CV_MAKE_TYPE(ddepth, cn);
|
dtype = CV_MAKE_TYPE(ddepth, cn);
|
||||||
bool haveMask = _mask.kind() != _InputArray::NONE;
|
|
||||||
CV_Assert(!haveMask || _mask.type() == CV_8UC1);
|
CV_Assert(!haveMask || _mask.type() == CV_8UC1);
|
||||||
|
|
||||||
int wgs2_aligned = 1;
|
int wgs2_aligned = 1;
|
||||||
@ -494,15 +496,19 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
|
|||||||
|
|
||||||
static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
|
static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
|
||||||
char cvt[40];
|
char cvt[40];
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
String opts = format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
|
||||||
format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
|
" -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s%s%s -D kercn=%d",
|
||||||
" -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
|
ocl::typeToStr(CV_MAKE_TYPE(depth, mcn)), ocl::typeToStr(depth),
|
||||||
ocl::typeToStr(type), ocl::typeToStr(depth),
|
ocl::typeToStr(dtype), ocl::typeToStr(CV_MAKE_TYPE(ddepth, mcn)),
|
||||||
ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
|
ocl::typeToStr(ddepth), ddepth, cn,
|
||||||
ocl::convertTypeStr(depth, ddepth, cn, cvt),
|
ocl::convertTypeStr(depth, ddepth, mcn, cvt),
|
||||||
opMap[sum_op], (int)wgs, wgs2_aligned,
|
opMap[sum_op], (int)wgs, wgs2_aligned,
|
||||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||||
haveMask ? " -D HAVE_MASK" : ""));
|
haveMask ? " -D HAVE_MASK" : "",
|
||||||
|
_src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
|
||||||
|
_mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn);
|
||||||
|
|
||||||
|
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -643,7 +649,7 @@ namespace cv {
|
|||||||
|
|
||||||
static bool ocl_countNonZero( InputArray _src, int & res )
|
static bool ocl_countNonZero( InputArray _src, int & res )
|
||||||
{
|
{
|
||||||
int type = _src.type(), depth = CV_MAT_DEPTH(type);
|
int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = ocl::predictOptimalVectorWidth(_src);
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||||
|
|
||||||
if (depth == CV_64F && !doubleSupport)
|
if (depth == CV_64F && !doubleSupport)
|
||||||
@ -658,9 +664,12 @@ static bool ocl_countNonZero( InputArray _src, int & res )
|
|||||||
wgs2_aligned >>= 1;
|
wgs2_aligned >>= 1;
|
||||||
|
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
||||||
format("-D srcT=%s -D OP_COUNT_NON_ZERO -D WGS=%d -D WGS2_ALIGNED=%d%s",
|
format("-D srcT=%s -D srcT1=%s -D cn=1 -D OP_COUNT_NON_ZERO"
|
||||||
ocl::typeToStr(type), (int)wgs,
|
" -D WGS=%d -D kercn=%d -D WGS2_ALIGNED=%d%s%s",
|
||||||
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
|
||||||
|
ocl::typeToStr(depth), (int)wgs, kercn,
|
||||||
|
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||||
|
_src.isContinuous() ? " -D HAVE_SRC_CONT" : ""));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
@ -1349,7 +1358,7 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
|
|||||||
CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
|
CV_Assert( (_src.channels() == 1 && (_mask.empty() || _mask.type() == CV_8U)) ||
|
||||||
(_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
|
(_src.channels() >= 1 && _mask.empty() && !minLoc && !maxLoc) );
|
||||||
|
|
||||||
int type = _src.type(), depth = CV_MAT_DEPTH(type);
|
int type = _src.type(), depth = CV_MAT_DEPTH(type), kercn = 1;
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||||
|
|
||||||
if (depth == CV_64F && !doubleSupport)
|
if (depth == CV_64F && !doubleSupport)
|
||||||
@ -1363,9 +1372,12 @@ static bool ocl_minMaxIdx( InputArray _src, double* minVal, double* maxVal, int*
|
|||||||
wgs2_aligned <<= 1;
|
wgs2_aligned <<= 1;
|
||||||
wgs2_aligned >>= 1;
|
wgs2_aligned >>= 1;
|
||||||
|
|
||||||
String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d -D WGS2_ALIGNED=%d%s",
|
String opts = format("-D DEPTH_%d -D srcT=%s -D OP_MIN_MAX_LOC%s -D WGS=%d"
|
||||||
|
" -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
|
||||||
depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs,
|
depth, ocl::typeToStr(depth), _mask.empty() ? "" : "_MASK", (int)wgs,
|
||||||
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||||
|
_src.isContinuous() ? " -D HAVE_SRC_CONT" : "",
|
||||||
|
_mask.isContinuous() ? " -D HAVE_MASK_CONT" : "", kercn);
|
||||||
|
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
|
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, opts);
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
@ -2090,9 +2102,11 @@ static bool ocl_norm( InputArray _src, int normType, InputArray _mask, double &
|
|||||||
|
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
||||||
format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d"
|
format("-D OP_NORM_INF_MASK -D HAVE_MASK -D DEPTH_%d"
|
||||||
" -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s",
|
" -D srcT=%s -D srcT1=%s -D WGS=%d -D cn=%d -D WGS2_ALIGNED=%d%s%s%s",
|
||||||
depth, ocl::typeToStr(type), ocl::typeToStr(depth),
|
depth, ocl::typeToStr(type), ocl::typeToStr(depth),
|
||||||
wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
wgs, cn, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||||
|
src.isContinuous() ? " -D HAVE_CONT_SRC" : "",
|
||||||
|
_mask.isContinuous() ? " -D HAVE_MASK_CONT" : ""));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@ -841,7 +841,10 @@ UMat UMat::mul(InputArray m, double scale) const
|
|||||||
|
|
||||||
static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
|
static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
|
||||||
{
|
{
|
||||||
int type = _src1.type(), depth = CV_MAT_DEPTH(type);
|
UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1);
|
||||||
|
|
||||||
|
int type = src1.type(), depth = CV_MAT_DEPTH(type),
|
||||||
|
kercn = ocl::predictOptimalVectorWidth(src1, src2);
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||||
|
|
||||||
if ( !doubleSupport && depth == CV_64F )
|
if ( !doubleSupport && depth == CV_64F )
|
||||||
@ -858,13 +861,18 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
|
|||||||
|
|
||||||
char cvt[40];
|
char cvt[40];
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
||||||
format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT -D WGS=%d -D WGS2_ALIGNED=%d%s",
|
format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
|
||||||
ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth, ocl::convertTypeStr(depth, ddepth, 1, cvt),
|
"-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
|
||||||
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth),
|
||||||
|
ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
|
||||||
|
ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt),
|
||||||
|
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||||
|
_src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
|
||||||
|
_src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1), db(1, dbsize, ddepth);
|
UMat db(1, dbsize, ddepth);
|
||||||
|
|
||||||
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
|
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
|
||||||
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
|
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user