optimized cv::UMat::dot
This commit is contained in:
parent
579499d900
commit
c52a77b90f
@ -158,10 +158,59 @@
|
|||||||
#define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset))
|
#define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if kercn == 1
|
||||||
#define REDUCE_GLOBAL \
|
#define REDUCE_GLOBAL \
|
||||||
SRC2_INDEX; \
|
SRC2_INDEX; \
|
||||||
dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
FUNC(accumulator, temp, temp2)
|
FUNC(accumulator, temp, temp2)
|
||||||
|
#elif kercn == 2
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1)
|
||||||
|
#elif kercn == 4
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1); \
|
||||||
|
FUNC(accumulator, temp.s2, temp2.s2); \
|
||||||
|
FUNC(accumulator, temp.s3, temp2.s3)
|
||||||
|
#elif kercn == 8
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1); \
|
||||||
|
FUNC(accumulator, temp.s2, temp2.s2); \
|
||||||
|
FUNC(accumulator, temp.s3, temp2.s3); \
|
||||||
|
FUNC(accumulator, temp.s4, temp2.s4); \
|
||||||
|
FUNC(accumulator, temp.s5, temp2.s5); \
|
||||||
|
FUNC(accumulator, temp.s6, temp2.s6); \
|
||||||
|
FUNC(accumulator, temp.s7, temp2.s7)
|
||||||
|
#elif kercn == 16
|
||||||
|
#define REDUCE_GLOBAL \
|
||||||
|
SRC2_INDEX; \
|
||||||
|
dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
|
||||||
|
FUNC(accumulator, temp.s0, temp2.s0); \
|
||||||
|
FUNC(accumulator, temp.s1, temp2.s1); \
|
||||||
|
FUNC(accumulator, temp.s2, temp2.s2); \
|
||||||
|
FUNC(accumulator, temp.s3, temp2.s3); \
|
||||||
|
FUNC(accumulator, temp.s4, temp2.s4); \
|
||||||
|
FUNC(accumulator, temp.s5, temp2.s5); \
|
||||||
|
FUNC(accumulator, temp.s6, temp2.s6); \
|
||||||
|
FUNC(accumulator, temp.s7, temp2.s7); \
|
||||||
|
FUNC(accumulator, temp.s8, temp2.s8); \
|
||||||
|
FUNC(accumulator, temp.s9, temp2.s9); \
|
||||||
|
FUNC(accumulator, temp.sA, temp2.sA); \
|
||||||
|
FUNC(accumulator, temp.sB, temp2.sB); \
|
||||||
|
FUNC(accumulator, temp.sC, temp2.sC); \
|
||||||
|
FUNC(accumulator, temp.sD, temp2.sD); \
|
||||||
|
FUNC(accumulator, temp.sE, temp2.sE); \
|
||||||
|
FUNC(accumulator, temp.sF, temp2.sF)
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#if kercn == 1
|
#if kercn == 1
|
||||||
#define REDUCE_GLOBAL \
|
#define REDUCE_GLOBAL \
|
||||||
|
@ -836,7 +836,10 @@ UMat UMat::mul(InputArray m, double scale) const
|
|||||||
|
|
||||||
static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
|
static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
|
||||||
{
|
{
|
||||||
int type = _src1.type(), depth = CV_MAT_DEPTH(type), kercn = 1;
|
UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1);
|
||||||
|
|
||||||
|
int type = src1.type(), depth = CV_MAT_DEPTH(type),
|
||||||
|
kercn = ocl::predictOptimalVectorWidth(src1, src2);
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
||||||
|
|
||||||
if ( !doubleSupport && depth == CV_64F )
|
if ( !doubleSupport && depth == CV_64F )
|
||||||
@ -853,17 +856,18 @@ static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
|
|||||||
|
|
||||||
char cvt[40];
|
char cvt[40];
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
|
||||||
format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
|
format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
|
||||||
"-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
|
"-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
|
||||||
ocl::typeToStr(depth), ocl::typeToStr(ddepth), ddepth,
|
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth),
|
||||||
ocl::convertTypeStr(depth, ddepth, 1, cvt),
|
ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
|
||||||
|
ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt),
|
||||||
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
|
||||||
_src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
|
_src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
|
||||||
_src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
|
_src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1), db(1, dbsize, ddepth);
|
UMat db(1, dbsize, ddepth);
|
||||||
|
|
||||||
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
|
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
|
||||||
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
|
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user