From a350b76738772e286b50f9b35e01ea84a3789757 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Date: Wed, 9 Jul 2014 19:00:33 +0400
Subject: [PATCH] optimization of cv::accumulate**

---
 modules/imgproc/src/accum.cpp            | 26 +++++++----
 modules/imgproc/src/opencl/accumulate.cl | 59 +++++++++++++++++-------
 2 files changed, 59 insertions(+), 26 deletions(-)

diff --git a/modules/imgproc/src/accum.cpp b/modules/imgproc/src/accum.cpp
index f2a47e3d3..398740576 100644
--- a/modules/imgproc/src/accum.cpp
+++ b/modules/imgproc/src/accum.cpp
@@ -369,11 +369,17 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
     CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
               op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);
 
-    int stype = _src.type(), cn = CV_MAT_CN(stype);
-    int sdepth = CV_MAT_DEPTH(stype), ddepth = _dst.depth();
+    const ocl::Device & dev = ocl::Device::getDefault();
+    int vectorWidths[] = { 4, 4, 2, 2, 1, 1, 1, -1 };
+    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth();
+    int pcn = std::max(vectorWidths[sdepth], vectorWidths[ddepth]), sesz = CV_ELEM_SIZE(sdepth) * pcn,
+        desz = CV_ELEM_SIZE(ddepth) * pcn, rowsPerWI = dev.isIntel() ? 4 : 1;
 
-    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0,
-            haveMask = !_mask.empty();
+    bool doubleSupport = dev.doubleFPConfig() > 0, haveMask = !_mask.empty(),
+        usepcn = _src.offset() % sesz == 0 && _src.step() % sesz == 0 && (_src.cols() * cn) % pcn == 0 &&
+            _src2.offset() % desz == 0 && _src2.step() % desz == 0 &&
+            _dst.offset() % pcn == 0 && _dst.step() % desz == 0 && !haveMask;
+    int kercn = usepcn ? pcn : haveMask ? cn : 1;
 
     if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
         return false;
@@ -381,11 +387,13 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
     const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
                                    "ACCUMULATE_WEIGHTED" };
 
+    char cvt[40];
     ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
-                  format("-D %s%s -D srcT=%s -D cn=%d -D dstT=%s%s",
+                  format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
                          opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
-                         ocl::typeToStr(sdepth), cn, ocl::typeToStr(ddepth),
-                         doubleSupport ? " -D DOUBLE_SUPPORT" : ""));
+                         ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth),
+                         doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI,
+                         ocl::convertTypeStr(sdepth, ddepth, 1, cvt)));
     if (k.empty())
         return false;
 
@@ -393,7 +401,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
 
     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
             src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
-            dstarg = ocl::KernelArg::ReadWrite(dst),
+            dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn),
             maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
 
     int argidx = k.set(0, srcarg);
@@ -410,7 +418,7 @@ static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray
     if (haveMask)
         k.set(argidx, maskarg);
 
-    size_t globalsize[2] = { src.cols, src.rows };
+    size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
     return k.run(2, globalsize, NULL, false);
 }
 
diff --git a/modules/imgproc/src/opencl/accumulate.cl b/modules/imgproc/src/opencl/accumulate.cl
index a60d4d6d9..f786f8038 100644
--- a/modules/imgproc/src/opencl/accumulate.cl
+++ b/modules/imgproc/src/opencl/accumulate.cl
@@ -13,13 +13,18 @@
 #endif
 #endif
 
+#define SRC_TSIZE cn * (int)sizeof(srcT1)
+#define DST_TSIZE cn * (int)sizeof(dstT1)
+
+#define noconvert
+
 __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_offset,
 #ifdef ACCUMULATE_PRODUCT
                          __global const uchar * src2ptr, int src2_step, int src2_offset,
 #endif
                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols
 #ifdef ACCUMULATE_WEIGHTED
-                         , dstT alpha
+                         , dstT1 alpha
 #endif
 #ifdef HAVE_MASK
                          , __global const uchar * mask, int mask_step, int mask_offset
@@ -27,39 +32,59 @@ __kernel void accumulate(__global const uchar * srcptr, int src_step, int src_of
                          )
 {
     int x = get_global_id(0);
-    int y = get_global_id(1);
+    int y = get_global_id(1) * rowsPerWI;
 
-    if (x < dst_cols && y < dst_rows)
+    if (x < dst_cols)
     {
-        int src_index = mad24(y, src_step, src_offset + x * cn * (int)sizeof(srcT));
+        int src_index = mad24(y, src_step, mad24(x, SRC_TSIZE, src_offset));
 #ifdef HAVE_MASK
         int mask_index = mad24(y, mask_step, mask_offset + x);
         mask += mask_index;
 #endif
-        int dst_index = mad24(y, dst_step, dst_offset + x * cn * (int)sizeof(dstT));
-
-        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
 #ifdef ACCUMULATE_PRODUCT
-        int src2_index = mad24(y, src2_step, src2_offset + x * cn * (int)sizeof(srcT));
-        __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
+        int src2_index = mad24(y, src2_step, mad24(x, SRC_TSIZE, src2_offset));
 #endif
-        __global dstT * dst = (__global dstT *)(dstptr + dst_index);
+        int dst_index = mad24(y, dst_step, mad24(x, DST_TSIZE, dst_offset));
 
         #pragma unroll
-        for (int c = 0; c < cn; ++c)
-#ifdef HAVE_MASK
-            if (mask[0])
+        for (int i = 0; i < rowsPerWI; ++i)
+            if (y < dst_rows)
+            {
+                __global const srcT1 * src = (__global const srcT1 *)(srcptr + src_index);
+#ifdef ACCUMULATE_PRODUCT
+                __global const srcT1 * src2 = (__global const srcT1 *)(src2ptr + src2_index);
 #endif
+                __global dstT1 * dst = (__global dstT1 *)(dstptr + dst_index);
+
+#ifdef HAVE_MASK
+                if (mask[0])
+#endif
+                    #pragma unroll
+                    for (int c = 0; c < cn; ++c)
+                    {
 #ifdef ACCUMULATE
-                dst[c] += src[c];
+                        dst[c] += convertToDT(src[c]);
 #elif defined ACCUMULATE_SQUARE
-                dst[c] += src[c] * src[c];
+                        dstT1 val = convertToDT(src[c]);
+                        dst[c] = fma(val, val, dst[c]);
 #elif defined ACCUMULATE_PRODUCT
-                dst[c] += src[c] * src2[c];
+                        dst[c] = fma(convertToDT(src[c]), convertToDT(src2[c]), dst[c]);
 #elif defined ACCUMULATE_WEIGHTED
-                dst[c] = (1 - alpha) * dst[c] + src[c] * alpha;
+                        dst[c] = fma(1 - alpha, dst[c], src[c] * alpha);
 #else
 #error "Unknown accumulation type"
 #endif
+                    }
+
+                src_index += src_step;
+#ifdef ACCUMULATE_PRODUCT
+                src2_index += src2_step;
+#endif
+#ifdef HAVE_MASK
+                mask += mask_step;
+#endif
+                dst_index += dst_step;
+                ++y;
+            }
     }
 }