Optimize openCL version of reduce function
This commit is contained in:
parent
ab2749d648
commit
169351b01d
@ -3428,11 +3428,60 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst,
|
|||||||
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
|
const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG",
|
||||||
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
|
"OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" };
|
||||||
char cvt[40];
|
char cvt[40];
|
||||||
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc,
|
|
||||||
format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
|
const int min_opt_cols = 128;
|
||||||
ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
|
if ((1 == dim) && (_src.cols() > min_opt_cols))
|
||||||
ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
|
{
|
||||||
doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
|
int buf_cols = 32;
|
||||||
|
|
||||||
|
cv::String build_opt_pre = format("-D BUF_COLS=%d -D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
|
||||||
|
buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
|
||||||
|
ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
|
||||||
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||||
|
ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre);
|
||||||
|
if (kpre.empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
cv::String build_opt_main = format("-D %s -D dim=1 -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=noconvert%s",
|
||||||
|
ops[op], cn, ddepth, ocl::typeToStr(ddepth), ocl::typeToStr(ddepth),
|
||||||
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||||
|
ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt_main);
|
||||||
|
if (kmain.empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
UMat src = _src.getUMat();
|
||||||
|
Size dsize(1, src.rows);
|
||||||
|
_dst.create(dsize, dtype);
|
||||||
|
UMat dst = _dst.getUMat(), temp = dst;
|
||||||
|
|
||||||
|
if (op0 == CV_REDUCE_AVG && sdepth < CV_32S && ddepth0 < CV_32S)
|
||||||
|
temp.create(dsize, CV_32SC(cn));
|
||||||
|
|
||||||
|
UMat buf(src.rows, buf_cols, temp.type());
|
||||||
|
|
||||||
|
size_t globalSize[2] = { buf_cols, src.rows };
|
||||||
|
|
||||||
|
kpre.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf));
|
||||||
|
if (!kpre.run(2, globalSize, NULL, false))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
globalSize[0] = src.rows;
|
||||||
|
kmain.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnlyNoSize(temp));
|
||||||
|
if (!kmain.run(1, globalSize, NULL, false))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (op0 == CV_REDUCE_AVG)
|
||||||
|
temp.convertTo(dst, ddepth0, 1. / (dim == 0 ? src.rows : src.cols));
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s",
|
||||||
|
ops[op], dim, cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth),
|
||||||
|
ocl::convertTypeStr(sdepth, ddepth, 1, cvt),
|
||||||
|
doubleSupport ? " -D DOUBLE_SUPPORT" : "");
|
||||||
|
|
||||||
|
ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt);
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
@ -91,6 +91,41 @@
|
|||||||
#error "No operation is specified"
|
#error "No operation is specified"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef BUF_COLS
|
||||||
|
#define BUF_COLS 64
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||||
|
__global uchar * bufptr, int buf_step, int buf_offset)
|
||||||
|
{
|
||||||
|
int x = get_global_id(0);
|
||||||
|
int y = get_global_id(1);
|
||||||
|
if (x < BUF_COLS)
|
||||||
|
{
|
||||||
|
int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset));
|
||||||
|
int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset));
|
||||||
|
|
||||||
|
__global const srcT * src = (__global const srcT *)(srcptr + src_index);
|
||||||
|
__global dstT * buf = (__global dstT *)(bufptr + buf_index);
|
||||||
|
dstT tmp[cn] = { INIT_VALUE };
|
||||||
|
|
||||||
|
int src_step_mul = BUF_COLS * cn;
|
||||||
|
for (int x = 0; x < cols; x += BUF_COLS, src += src_step_mul)
|
||||||
|
{
|
||||||
|
#pragma unroll
|
||||||
|
for (int c = 0; c < cn; ++c)
|
||||||
|
{
|
||||||
|
dstT value = convertToDT(src[c]);
|
||||||
|
PROCESS_ELEM(tmp[c], value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int c = 0; c < cn; ++c)
|
||||||
|
buf[c] = tmp[c];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
__kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols,
|
||||||
__global uchar * dstptr, int dst_step, int dst_offset)
|
__global uchar * dstptr, int dst_step, int dst_offset)
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user