diff --git a/modules/core/src/matrix.cpp b/modules/core/src/matrix.cpp index f199cb253..0db08e884 100644 --- a/modules/core/src/matrix.cpp +++ b/modules/core/src/matrix.cpp @@ -3441,8 +3441,11 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst, const int min_opt_cols = 128, buf_cols = 32; int sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = CV_MAT_DEPTH(dtype), ddepth0 = ddepth; - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, - useOptimized = 1 == dim && _src.cols() > min_opt_cols; + const ocl::Device &defDev = ocl::Device::getDefault(); + bool doubleSupport = defDev.doubleFPConfig() > 0; + + size_t wgs = defDev.maxWorkGroupSize(); + bool useOptimized = 1 == dim && _src.cols() > min_opt_cols && (wgs >= buf_cols); if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; @@ -3455,78 +3458,80 @@ static bool ocl_reduce(InputArray _src, OutputArray _dst, const char * const ops[4] = { "OCL_CV_REDUCE_SUM", "OCL_CV_REDUCE_AVG", "OCL_CV_REDUCE_MAX", "OCL_CV_REDUCE_MIN" }; - char cvt[2][40]; - int wdepth = std::max(ddepth, CV_32F); - cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d" - " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s" - " -D convertToDT=%s -D convertToDT0=%s%s", - ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth), - ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0), - ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]), - ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]), - ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - if (useOptimized) { - cv::String build_opt_pre = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D %s -D dim=1" - " -D cn=%d -D ddepth=%d -D srcT=%s -D dstT=%s -D convertToDT=%s%s", - buf_cols, ops[op], cn, ddepth, ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), - ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]), - doubleSupport ? " -D DOUBLE_SUPPORT" : ""); - ocl::Kernel kpre("reduce_horz_pre", ocl::core::reduce2_oclsrc, build_opt_pre); - if (kpre.empty()) + size_t tileHeight = (size_t)(wgs / buf_cols); + if (defDev.isIntel()) + { + static const size_t maxItemInGroupCount = 16; + tileHeight = min(tileHeight, defDev.localMemSize() / buf_cols / CV_ELEM_SIZE(CV_MAKETYPE(wdepth, cn)) / maxItemInGroupCount); + } + char cvt[3][40]; + cv::String build_opt = format("-D OP_REDUCE_PRE -D BUF_COLS=%d -D TILE_HEIGHT=%d -D %s -D dim=1" + " -D cn=%d -D ddepth=%d" + " -D srcT=%s -D bufT=%s -D dstT=%s" + " -D convertToWT=%s -D convertToBufT=%s -D convertToDT=%s%s", + buf_cols, tileHeight, ops[op], cn, ddepth, + ocl::typeToStr(sdepth), + ocl::typeToStr(ddepth), + ocl::typeToStr(ddepth0), + ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]), + ocl::convertTypeStr(sdepth, ddepth, 1, cvt[1]), + ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[2]), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + ocl::Kernel k("reduce_horz_opt", ocl::core::reduce2_oclsrc, build_opt); + if (k.empty()) return false; - - ocl::Kernel kmain("reduce", ocl::core::reduce2_oclsrc, build_opt); - if (kmain.empty()) - return false; - UMat src = _src.getUMat(); Size dsize(1, src.rows); _dst.create(dsize, dtype); UMat dst = _dst.getUMat(); - UMat buf(src.rows, buf_cols, dst.type()); - - kpre.args(ocl::KernelArg::ReadOnly(src), - ocl::KernelArg::WriteOnlyNoSize(buf)); + if (op0 == CV_REDUCE_AVG) + k.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols); + else + k.args(ocl::KernelArg::ReadOnly(src), + ocl::KernelArg::WriteOnlyNoSize(dst)); + size_t localSize[2] = { buf_cols, tileHeight}; size_t globalSize[2] = { buf_cols, src.rows }; - if (!kpre.run(2, globalSize, NULL, false)) + return k.run(2, globalSize, localSize, false); + } + else + { + char cvt[2][40]; + cv::String build_opt = format("-D %s -D dim=%d -D cn=%d -D ddepth=%d" + " -D srcT=%s -D dstT=%s -D dstT0=%s -D convertToWT=%s" + " -D convertToDT=%s -D convertToDT0=%s%s", + ops[op], dim, cn, ddepth, ocl::typeToStr(useOptimized ? ddepth : sdepth), + ocl::typeToStr(ddepth), ocl::typeToStr(ddepth0), + ocl::convertTypeStr(ddepth, wdepth, 1, cvt[0]), + ocl::convertTypeStr(sdepth, ddepth, 1, cvt[0]), + ocl::convertTypeStr(wdepth, ddepth0, 1, cvt[1]), + doubleSupport ? " -D DOUBLE_SUPPORT" : ""); + + ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt); + if (k.empty()) return false; + UMat src = _src.getUMat(); + Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows); + _dst.create(dsize, dtype); + UMat dst = _dst.getUMat(); + + ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), + temparg = ocl::KernelArg::WriteOnlyNoSize(dst); + if (op0 == CV_REDUCE_AVG) - kmain.args(ocl::KernelArg::ReadOnly(buf), - ocl::KernelArg::WriteOnlyNoSize(dst), 1.0f / src.cols); + k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols)); else - kmain.args(ocl::KernelArg::ReadOnly(buf), - ocl::KernelArg::WriteOnlyNoSize(dst)); + k.args(srcarg, temparg); - globalSize[0] = src.rows; - return kmain.run(1, globalSize, NULL, false); + size_t globalsize = std::max(dsize.width, dsize.height); + return k.run(1, &globalsize, NULL, false); } - - ocl::Kernel k("reduce", ocl::core::reduce2_oclsrc, build_opt); - if (k.empty()) - return false; - - UMat src = _src.getUMat(); - Size dsize(dim == 0 ? src.cols : 1, dim == 0 ? 1 : src.rows); - _dst.create(dsize, dtype); - UMat dst = _dst.getUMat(); - - ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), - temparg = ocl::KernelArg::WriteOnlyNoSize(dst); - - if (op0 == CV_REDUCE_AVG) - k.args(srcarg, temparg, 1.0f / (dim == 0 ? src.rows : src.cols)); - else - k.args(srcarg, temparg); - - size_t globalsize = std::max(dsize.width, dsize.height); - return k.run(1, &globalsize, NULL, false); } } diff --git a/modules/core/src/opencl/reduce2.cl b/modules/core/src/opencl/reduce2.cl index 7800e7a74..457378cc1 100644 --- a/modules/core/src/opencl/reduce2.cl +++ b/modules/core/src/opencl/reduce2.cl @@ -81,29 +81,34 @@ #define PROCESS_ELEM(acc, value) acc += value #elif defined OCL_CV_REDUCE_MAX #define INIT_VALUE MIN_VAL -#define PROCESS_ELEM(acc, value) acc = value > acc ? value : acc +#define PROCESS_ELEM(acc, value) acc = max(value, acc) #elif defined OCL_CV_REDUCE_MIN #define INIT_VALUE MAX_VAL -#define PROCESS_ELEM(acc, value) acc = value < acc ? value : acc +#define PROCESS_ELEM(acc, value) acc = min(value, acc) #else #error "No operation is specified" #endif #ifdef OP_REDUCE_PRE -__kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols, - __global uchar * bufptr, int buf_step, int buf_offset) +__kernel void reduce_horz_opt(__global const uchar * srcptr, int src_step, int src_offset, int rows, int cols, + __global uchar * dstptr, int dst_step, int dst_offset +#ifdef OCL_CV_REDUCE_AVG + , float fscale +#endif + ) { + __local bufT lsmem[TILE_HEIGHT][BUF_COLS][cn]; + int x = get_global_id(0); int y = get_global_id(1); - if (x < BUF_COLS) + int liy = get_local_id(1); + if ((x < BUF_COLS) && (y < rows)) { int src_index = mad24(y, src_step, mad24(x, (int)sizeof(srcT) * cn, src_offset)); - int buf_index = mad24(y, buf_step, mad24(x, (int)sizeof(dstT) * cn, buf_offset)); __global const srcT * src = (__global const srcT *)(srcptr + src_index); - __global dstT * buf = (__global dstT *)(bufptr + buf_index); - dstT tmp[cn] = { INIT_VALUE }; + bufT tmp[cn] = { INIT_VALUE }; int src_step_mul = BUF_COLS * cn; for (int idx = x; idx < cols; idx += BUF_COLS, src += src_step_mul) @@ -111,14 +116,49 @@ __kernel void reduce_horz_pre(__global const uchar * srcptr, int src_step, int s #pragma unroll for (int c = 0; c < cn; ++c) { - dstT value = convertToDT(src[c]); + bufT value = convertToBufT(src[c]); PROCESS_ELEM(tmp[c], value); } } #pragma unroll for (int c = 0; c < cn; ++c) - buf[c] = tmp[c]; + lsmem[liy][x][c] = tmp[c]; + } + barrier(CLK_LOCAL_MEM_FENCE); + if ((x < BUF_COLS / 2) && (y < rows)) + { + #pragma unroll + for (int c = 0; c < cn; ++c) + { + PROCESS_ELEM(lsmem[liy][x][c], lsmem[liy][x + BUF_COLS / 2][c]); + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if ((x == 0) && (y < rows)) + { + int dst_index = mad24(y, dst_step, dst_offset); + + __global dstT * dst = (__global dstT *)(dstptr + dst_index); + bufT tmp[cn] = { INIT_VALUE }; + + #pragma unroll + for (int xin = 0; xin < BUF_COLS / 2; xin ++) + { + #pragma unroll + for (int c = 0; c < cn; ++c) + { + PROCESS_ELEM(tmp[c], lsmem[liy][xin][c]); + } + } + + #pragma unroll + for (int c = 0; c < cn; ++c) +#ifdef OCL_CV_REDUCE_AVG + dst[c] = convertToDT(convertToWT(tmp[c]) * fscale); +#else + dst[c] = convertToDT(tmp[c]); +#endif } }