diff --git a/modules/imgproc/src/opencl/match_template.cl b/modules/imgproc/src/opencl/match_template.cl
index ef7ba2d2d..7c80b3c16 100644
--- a/modules/imgproc/src/opencl/match_template.cl
+++ b/modules/imgproc/src/opencl/match_template.cl
@@ -68,35 +68,47 @@ inline float normAcc_SQDIFF(float num, float denum)
 
 #ifdef CALC_SUM
 
-__kernel void calcSum(__global const uchar * templateptr, int template_step, int template_offset,
-                     int template_rows, int template_cols, __global float * result)
+__kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset,
+                      int cols, int total, __global float * dst)
 {
-    __global const T * template = (__global const T *)(templateptr + template_offset);
+    int lid = get_local_id(0), id = get_global_id(0);
 
-    WT res = (WT)(0);
+    __local WT localmem[WGS2_ALIGNED];
+    WT accumulator = (WT)(0), tmp;
 
-    for (int y = 0; y < template_rows; ++y)
+    for ( ; id < total; id += WGS)
     {
-        for (int x = 0; x < template_cols; ++x)
-        {
-            WT value = convertToWT(template[x]);
-#ifdef SUM_2
-#if wdepth == 4
-            res = mad24(value, value, res);
-#else
-            res = mad(value, value, res);
-#endif
-#elif defined SUM_1
-            res += value;
-#else
-#error "No operation is specified"
-#endif
-        }
+        int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(T), src_offset));
+        __global const T * src = (__global const T *)(srcptr + src_index);
 
-        template = (__global const T *)((__global const uchar *)template + template_step);
+        tmp = convertToWT(src[0]);
+#if wdepth == 4
+        accumulator = mad24(tmp, tmp, accumulator);
+#else
+        accumulator = mad(tmp, tmp, accumulator);
+#endif
     }
 
-    result[0] = convertToDT(res);
+    if (lid < WGS2_ALIGNED)
+        localmem[lid] = accumulator;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)
+        localmem[lid - WGS2_ALIGNED] += accumulator;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)
+    {
+        if (lid < lsize)
+        {
+            int lid2 = lsize + lid;
+            localmem[lid] += localmem[lid2];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (lid == 0)
+        dst[0] = convertToDT(localmem[0]);
 }
 
 #elif defined CCORR
diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp
index 2298b12a5..ca132dd0f 100644
--- a/modules/imgproc/src/templmatch.cpp
+++ b/modules/imgproc/src/templmatch.cpp
@@ -40,7 +40,6 @@
 //M*/
 
 #include "precomp.hpp"
-#define CV_OPENCL_RUN_ASSERT
 #include "opencl_kernels.hpp"
 
 ////////////////////////////////////////////////// matchTemplate //////////////////////////////////////////////////////////
@@ -57,28 +56,36 @@ enum
     SUM_1 = 0, SUM_2 = 1
 };
 
-static bool sumTemplate(InputArray _templ, UMat & result, int sum_type)
+static bool sumTemplate(InputArray _src, UMat & result)
 {
-    CV_Assert(sum_type == SUM_1 || sum_type == SUM_2);
-
-    int type = _templ.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     int wdepth = std::max(CV_32S, depth), wtype = CV_MAKE_TYPE(wdepth, cn);
+    size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
+
+    int wgs2_aligned = 1;
+    while (wgs2_aligned < (int)wgs)
+        wgs2_aligned <<= 1;
+    wgs2_aligned >>= 1;
 
     char cvt[40];
-    const char * const sumTypeToStr[] = { "SUM_1", "SUM_2" };
     ocl::Kernel k("calcSum", ocl::imgproc::match_template_oclsrc,
-                  format("-D CALC_SUM -D %s -D T=%s -D WT=%s -D convertToWT=%s -D cn=%d -D wdepth=%d",
-                         sumTypeToStr[sum_type], ocl::typeToStr(type), ocl::typeToStr(wtype),
-                         ocl::convertTypeStr(depth, wdepth, cn, cvt), cn, wdepth));
+                  format("-D CALC_SUM -D T=%s -D WT=%s -D cn=%d -D convertToWT=%s -D WGS=%d -D WGS2_ALIGNED=%d -D wdepth=%d",
+                         ocl::typeToStr(type), ocl::typeToStr(wtype), cn,
+                         ocl::convertTypeStr(depth, wdepth, cn, cvt),
+                         (int)wgs, wgs2_aligned, wdepth));
     if (k.empty())
         return false;
 
+    UMat src = _src.getUMat();
     result.create(1, 1, CV_32FC1);
-    UMat templ = _templ.getUMat();
 
-    k.args(ocl::KernelArg::ReadOnly(templ), ocl::KernelArg::PtrWriteOnly(result));
+    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
+            resarg = ocl::KernelArg::PtrWriteOnly(result);
 
-    return k.runTask(false);
+    k.args(srcarg, src.cols, (int)src.total(), resarg);
+
+    size_t globalsize = wgs;
+    return k.run(1, &globalsize, &wgs, false);
 }
 
 static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result)
@@ -123,7 +130,7 @@ static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, Out
     integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
 
     UMat templ_sqsum;
-    if (!sumTemplate(templ, templ_sqsum, SUM_2))
+    if (!sumTemplate(templ, templ_sqsum))
         return false;
 
     k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
@@ -177,7 +184,7 @@ static bool matchTemplate_SQDIFF_NORMED(InputArray _image, InputArray _templ, Ou
     integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
 
     UMat templ_sqsum;
-    if (!sumTemplate(_templ, templ_sqsum, SUM_2))
+    if (!sumTemplate(_templ, templ_sqsum))
         return false;
 
     k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),