From a350b76738772e286b50f9b35e01ea84a3789757 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Wed, 9 Jul 2014 19:00:33 +0400 Subject: [PATCH] optimization of cv::accumulate** --- modules/imgproc/src/accum.cpp | 26 +++++++---- modules/imgproc/src/opencl/accumulate.cl | 59 +++++++++++++++++------- 2 files changed, 59 insertions(+), 26 deletions(-) diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp index f2a47e3d3..398740576 100644 --- a/modules/imgproc/src/accum.cpp +++ b/modules/imgproc/src/accum.cpp @@ -369,11 +369,17 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE || op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED); - int stype = _src.type(), cn = CV_MAT_CN(stype); - int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth(); + const ocl::Device & dev = ocl::Device::getDefault(); + int vectorWidths[] = { 4, 4, 2, 2, 1, 1, 1, -1 }; + int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth(); + int pcn = std::max(vectorWidths[sdepth], vectorWidths[ddepth]), sesz = CV_ELEM_SIZE(sdepth) * pcn, + desz = CV_ELEM_SIZE(ddepth) * pcn, rowsPerWI = dev.isIntel() ? 4 : 1; - bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0, - haveMask = !_mask.empty(); + bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(), + usepcn = _src.offset() % sesz == 0 && _src.step() % sesz == 0 && (_src.cols() * cn) % pcn == 0 && + _src2.offset() % desz == 0 && _src2.step() % desz == 0 && + _dst.offset() % pcn == 0 && _dst.step() % desz == 0 && !haveMask; + int kercn = usepcn ? pcn : haveMask ? cn : 1; if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) return false; @@ -381,11 +387,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT", "ACCUMULATE_WEIGHTED" }; + char cvt[40]; ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc, - format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s", + format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s", opMap[op_type], haveMask ? " -D HAVE_MASK" : "", - ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth), - doubleSupport ? " -D DOUBLE_SUPPORT" : "")); + ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth), + doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI, + ocl::convertTypeStr(sdepth, ddepth, 1, cvt))); if (k.empty()) return false; @@ -393,7 +401,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), src2arg = ocl::KernelArg::ReadOnlyNoSize(src2), - dstarg = ocl::KernelArg::ReadWrite(dst), + dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn), maskarg = ocl::KernelArg::ReadOnlyNoSize(mask); int argidx = k.set(0, srcarg); @@ -410,7 +418,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray if (haveMask) k.set(argidx, maskarg); - size_t globalsize[2] = { src.cols, src.rows }; + size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI }; return k.run(2, globalsize, NULL, false); } diff --git a/modules/imgproc/src/opencl/accumulate.cl b/modules/imgproc/src/opencl/accumulate.cl index a60d4d6d9..f786f8038 100644 --- a/modules/imgproc/src/opencl/accumulate.cl +++ b/modules/imgproc/src/opencl/accumulate.cl @@ -13,13 +13,18 @@ #endif #endif +#define SRC_TSIZE cn * (int)sizeof(srcT1) +#define DST_TSIZE cn * (int)sizeof(dstT1) + +#define noconvert + __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset, #ifdef ACCUMULATE_PRODUCT __global const uchar * src2ptr, int src2_step, int src2_offset, #endif __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols #ifdef ACCUMULATE_WEIGHTED - , dstT alpha + , dstT1 alpha #endif #ifdef HAVE_MASK , __global const uchar * mask, int mask_step, int mask_offset @@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of ) { int x = get_global_id(0); - int y = get_global_id(1); + int y = get_global_id(1) * rowsPerWI; - if (x < dst_cols && y < dst_rows) + if (x < dst_cols) { - int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT)); + int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset)); #ifdef HAVE_MASK int mask_index = mad24(y, mask_step, mask_offset + x); mask += mask_index; #endif - int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT)); - - __global const srcT * src = (__global const srcT *)(srcptr + src_index); #ifdef ACCUMULATE_PRODUCT - int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT)); - __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); + int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset)); #endif - __global dstT * dst = (__global dstT *)(dstptr + dst_index); + int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset)); #pragma unroll - for (int c = 0; c < cn; ++c) -#ifdef HAVE_MASK - if (mask[0]) + for (int i = 0; i < rowsPerWI; ++i) + if (y < dst_rows) + { + __global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index); +#ifdef ACCUMULATE_PRODUCT + __global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index); #endif + __global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index); + +#ifdef HAVE_MASK + if (mask[0]) +#endif + #pragma unroll + for (int c = 0; c < cn; ++c) + { #ifdef ACCUMULATE - dst[c] += src[c]; + dst[c] += convertToDT(src[c]); #elif defined ACCUMULATE_SQUARE - dst[c] += src[c] * src[c]; + dstT1 val = convertToDT(src[c]); + dst[c] = fma(val, val, dst[c]); #elif defined ACCUMULATE_PRODUCT - dst[c] += src[c] * src2[c]; + dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]); #elif defined ACCUMULATE_WEIGHTED - dst[c] = (1 - alpha) * dst[c] + src[c] * alpha; + dst[c] = fma(1 - alpha, dst[c], src[c] * alpha); #else #error "Unknown accumulation type" #endif + } + + src_index += src_step; +#ifdef ACCUMULATE_PRODUCT + src2_index += src2_step; +#endif +#ifdef HAVE_MASK + mask += mask_step; +#endif + dst_index += dst_step; + ++y; + } } }