merged 2 kernels
This commit is contained in:
parent
c836613bad
commit
26b73a7bbd
@ -3411,76 +3411,76 @@ namespace cv {
|
|||||||
static bool ocl_reduce(InputArray _src, OutputArray _dst,
|
static bool ocl_reduce(InputArray _src, OutputArray _dst,
|
||||||
int dim, int op, int op0, int stype, int dtype)
|
int dim, int op, int op0, int stype, int dtype)
|
||||||
{
|
{
|
||||||
|
const int min_opt_cols = 128, buf_cols = 32;
|
||||||
int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
|
int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
|
||||||
ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
|
ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth;
|
||||||
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
|
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
|
||||||
|
useOptimized = 1 == dim && _src.cols() > min_opt_cols;
|
||||||
|
|
||||||
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|
if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (op == CV_REDUCE_AVG)
|
if (op == CV_REDUCE_AVG)
|
||||||
{
|
{
|
||||||
op = CV_REDUCE_SUM;
|
|
||||||
if (sdepth < CV_32S && ddepth < CV_32S)
|
if (sdepth < CV_32S && ddepth < CV_32S)
|
||||||
ddepth = CV_32S;
|
ddepth = CV_32S;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
|
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
|
||||||
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
|
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
|
||||||
char cvt[40];
|
char cvt[2][40];
|
||||||
|
|
||||||
const int min_opt_cols = 128;
|
int wdepth = std::max(ddepth, CV_32F);
|
||||||
if ((1 == dim) && (_src.cols() > min_opt_cols))
|
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d"
|
||||||
|
" -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s"
|
||||||
|
" -D convertToDT=%s -D convertToDT0=%s%s",
|
||||||
|
ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth),
|
||||||
|
ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0),
|
||||||
|
ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]),
|
||||||
|
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
|
||||||
|
ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]),
|
||||||
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||||
|
|
||||||
|
if (useOptimized)
|
||||||
{
|
{
|
||||||
int buf_cols = 32;
|
cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1"
|
||||||
|
" -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
|
||||||
cv::String build_opt_pre = format("-D BUF_COLS=%d -D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
|
buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
|
||||||
buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
|
ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]),
|
||||||
ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
|
||||||
ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
|
ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
|
||||||
if (kpre.empty())
|
if (kpre.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
cv::String build_opt_main = format("-D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=noconvert%s",
|
ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt);
|
||||||
ops[op], cn, ddepth, ocl::typeToStr(ddepth), ocl::typeToStr(ddepth),
|
|
||||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
|
||||||
ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt_main);
|
|
||||||
if (kmain.empty())
|
if (kmain.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
UMat src = _src.getUMat();
|
UMat src = _src.getUMat();
|
||||||
Size dsize(1, src.rows);
|
Size dsize(1, src.rows);
|
||||||
_dst.create(dsize, dtype);
|
_dst.create(dsize, dtype);
|
||||||
UMat dst = _dst.getUMat(), temp = dst;
|
UMat dst = _dst.getUMat();
|
||||||
|
|
||||||
if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S)
|
UMat buf(src.rows, buf_cols, dst.type());
|
||||||
temp.create(dsize, CV_32SC(cn));
|
|
||||||
|
|
||||||
UMat buf(src.rows, buf_cols, temp.type());
|
kpre.args(ocl::KernelArg::ReadOnly(src),
|
||||||
|
ocl::KernelArg::WriteOnlyNoSize(buf));
|
||||||
|
|
||||||
size_t globalSize[2] = { buf_cols, src.rows };
|
size_t globalSize[2] = { buf_cols, src.rows };
|
||||||
|
|
||||||
kpre.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
|
|
||||||
if (!kpre.run(2, globalSize, NULL, false))
|
if (!kpre.run(2, globalSize, NULL, false))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
globalSize[0] = src.rows;
|
|
||||||
kmain.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnlyNoSize(temp));
|
|
||||||
if (!kmain.run(1, globalSize, NULL, false))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (op0 == CV_REDUCE_AVG)
|
if (op0 == CV_REDUCE_AVG)
|
||||||
temp.convertTo(dst, ddepth0, 1. / src.cols);
|
kmain.args(ocl::KernelArg::ReadOnly(buf),
|
||||||
|
ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols);
|
||||||
|
else
|
||||||
|
kmain.args(ocl::KernelArg::ReadOnly(buf),
|
||||||
|
ocl::KernelArg::WriteOnlyNoSize(dst));
|
||||||
|
|
||||||
return true;
|
globalSize[0] = src.rows;
|
||||||
|
return kmain.run(1, globalSize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
|
|
||||||
ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
|
|
||||||
ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
|
|
||||||
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
|
||||||
|
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
|
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
@ -3488,22 +3488,18 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
|
|||||||
UMat src = _src.getUMat();
|
UMat src = _src.getUMat();
|
||||||
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
|
Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows);
|
||||||
_dst.create(dsize, dtype);
|
_dst.create(dsize, dtype);
|
||||||
UMat dst = _dst.getUMat(), temp = dst;
|
UMat dst = _dst.getUMat();
|
||||||
|
|
||||||
if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S)
|
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src),
|
||||||
temp.create(dsize, CV_32SC(cn));
|
temparg = ocl::KernelArg::WriteOnlyNoSize(dst);
|
||||||
|
|
||||||
size_t globalsize = std::max(dsize.width, dsize.height);
|
|
||||||
|
|
||||||
k.args(ocl::KernelArg::ReadOnly(src),
|
|
||||||
ocl::KernelArg::WriteOnlyNoSize(temp));
|
|
||||||
if (!k.run(1, &globalsize, NULL, false))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
if (op0 == CV_REDUCE_AVG)
|
if (op0 == CV_REDUCE_AVG)
|
||||||
temp.convertTo(dst, ddepth0, 1. / (dim == 0 ? src.rows : src.cols));
|
k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols));
|
||||||
|
else
|
||||||
|
k.args(srcarg, temparg);
|
||||||
|
|
||||||
return true;
|
size_t globalsize = std::max(dsize.width, dsize.height);
|
||||||
|
return k.run(1, &globalsize, NULL, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -76,24 +76,20 @@
|
|||||||
|
|
||||||
#define noconvert
|
#define noconvert
|
||||||
|
|
||||||
#ifdef OCL_CV_REDUCE_SUM
|
#if defined OCL_CV_REDUCE_SUM || defined OCL_CV_REDUCE_AVG
|
||||||
#define INIT_VALUE 0
|
#define INIT_VALUE 0
|
||||||
#define PROCESS_ELEM(acc, value) acc += value
|
#define PROCESS_ELEM(acc, value) acc += value
|
||||||
#elif defined(OCL_CV_REDUCE_MAX)
|
#elif defined OCL_CV_REDUCE_MAX
|
||||||
#define INIT_VALUE MIN_VAL
|
#define INIT_VALUE MIN_VAL
|
||||||
#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
|
#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc
|
||||||
#elif defined(OCL_CV_REDUCE_MIN)
|
#elif defined OCL_CV_REDUCE_MIN
|
||||||
#define INIT_VALUE MAX_VAL
|
#define INIT_VALUE MAX_VAL
|
||||||
#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
|
#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc
|
||||||
#elif defined(OCL_CV_REDUCE_AVG)
|
|
||||||
#error "This operation should be implemented through OCL_CV_REDUCE_SUM"
|
|
||||||
#else
|
#else
|
||||||
#error "No operation is specified"
|
#error "No operation is specified"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef BUF_COLS
|
#ifdef OP_REDUCE_PRE
|
||||||
#define BUF_COLS 32
|
|
||||||
#endif
|
|
||||||
|
|
||||||
__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||||
__global uchar * bufptr, int buf_step, int buf_offset)
|
__global uchar * bufptr, int buf_step, int buf_offset)
|
||||||
@ -126,15 +122,23 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||||
__global uchar * dstptr, int dst_step, int dst_offset)
|
__global uchar * dstptr, int dst_step, int dst_offset
|
||||||
|
#ifdef OCL_CV_REDUCE_AVG
|
||||||
|
, float fscale
|
||||||
|
#endif
|
||||||
|
)
|
||||||
{
|
{
|
||||||
#if dim == 0 // reduce to a single row
|
#if dim == 0 // reduce to a single row
|
||||||
int x = get_global_id(0);
|
int x = get_global_id(0);
|
||||||
if (x < cols)
|
if (x < cols)
|
||||||
{
|
{
|
||||||
int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset);
|
int src_index = mad24(x, (int)sizeof(srcT) * cn, src_offset);
|
||||||
__global dstT * dst = (__global dstT *)(dstptr + dst_offset) + x * cn;
|
int dst_index = mad24(x, (int)sizeof(dstT0) * cn, dst_offset);
|
||||||
|
|
||||||
|
__global dstT0 * dst = (__global dstT0 *)(dstptr + dst_index);
|
||||||
dstT tmp[cn] = { INIT_VALUE };
|
dstT tmp[cn] = { INIT_VALUE };
|
||||||
|
|
||||||
for (int y = 0; y < rows; ++y, src_index += src_step)
|
for (int y = 0; y < rows; ++y, src_index += src_step)
|
||||||
@ -150,7 +154,11 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
|
|||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int c = 0; c < cn; ++c)
|
for (int c = 0; c < cn; ++c)
|
||||||
dst[c] = tmp[c];
|
#ifdef OCL_CV_REDUCE_AVG
|
||||||
|
dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);
|
||||||
|
#else
|
||||||
|
dst[c] = convertToDT0(tmp[c]);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#elif dim == 1 // reduce to a single column
|
#elif dim == 1 // reduce to a single column
|
||||||
int y = get_global_id(0);
|
int y = get_global_id(0);
|
||||||
@ -175,9 +183,15 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
|
|||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int c = 0; c < cn; ++c)
|
for (int c = 0; c < cn; ++c)
|
||||||
dst[c] = tmp[c];
|
#ifdef OCL_CV_REDUCE_AVG
|
||||||
|
dst[c] = convertToDT0(convertToWT(tmp[c]) * fscale);
|
||||||
|
#else
|
||||||
|
dst[c] = convertToDT0(tmp[c]);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
#error "Dims must be either 0 or 1"
|
#error "Dims must be either 0 or 1"
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
Loading…
x
Reference in New Issue
Block a user