diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index c1e35d72c..d32aa327a 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -604,6 +604,11 @@ namespace cv //! supports only CV_8UC1 source type CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum); + //! computes squared integral image + //! result matrix will have 64F type, but will contain 64U values + //! supports source images of 8UC1 type only + CV_EXPORTS void sqrIntegral(const GpuMat& src, GpuMat& sqsum); + //! computes vertical sum, supports only CV_32FC1 images CV_EXPORTS void columnSum(const GpuMat& src, GpuMat& sum); diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp index 7374c93b2..3eef44c8d 100644 --- a/modules/gpu/src/imgproc_gpu.cpp +++ b/modules/gpu/src/imgproc_gpu.cpp @@ -62,6 +62,7 @@ void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int) { t void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int) { throw_nogpu(); } void cv::gpu::integral(const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); } +void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); } void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&) { throw_nogpu(); } void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int) { throw_nogpu(); } @@ -585,6 +586,28 @@ void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum) sum.step, sqsum.ptr(), sqsum.step, sz, 0, 0.0f, h) ); } +////////////////////////////////////////////////////////////////////////////// +// sqrIntegral + +void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum) +{ + CV_Assert(src.type() == CV_8U); + + NppStSize32u roiSize; + roiSize.width = src.cols; + roiSize.height = src.rows; + + NppSt32u bufSize; + nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize)); + GpuMat buf(1, bufSize, CV_8U); + + sqsum.create(src.rows + 1, src.cols + 1, CV_64F); + nppSafeCall(nppiStSqrIntegral_8u64u_C1R( + const_cast<NppSt8u*>(src.ptr<NppSt8u>(0)), src.step, + sqsum.ptr<NppSt64u>(0), sqsum.step, roiSize, + buf.ptr<NppSt8u>(0), bufSize)); +} + ////////////////////////////////////////////////////////////////////////////// // columnSum diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp index 70141371a..7ff33da6e 100644 --- a/modules/gpu/src/match_template.cpp +++ b/modules/gpu/src/match_template.cpp @@ -155,14 +155,6 @@ namespace cv { namespace gpu { namespace imgproc namespace { - // Computes integral image. Result matrix will have data type 32S, - // while actuall data type is 32U - void integral_8U_32U(const GpuMat& src, GpuMat& sum); - - // Computes squared integral image. Result matrix will have data type 64F, - // while actual data type is 64U - void sqrIntegral_8U_64U(const GpuMat& src, GpuMat& sqsum); - // Estimates optimal blocks size for FFT method void estimateBlockSize(int w, int h, int tw, int th, int& bw, int& bh); @@ -183,47 +175,7 @@ namespace void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - - - void integral_8U_32U(const GpuMat& src, GpuMat& sum) - { - CV_Assert(src.type() == CV_8U); - - NppStSize32u roiSize; - roiSize.width = src.cols; - roiSize.height = src.rows; - - NppSt32u bufSize; - nppSafeCall(nppiStIntegralGetSize_8u32u(roiSize, &bufSize)); - GpuMat buf(1, bufSize, CV_8U); - - sum.create(src.rows + 1, src.cols + 1, CV_32S); - nppSafeCall(nppiStIntegral_8u32u_C1R( - const_cast<NppSt8u*>(src.ptr<NppSt8u>(0)), src.step, - sum.ptr<NppSt32u>(0), sum.step, roiSize, - buf.ptr<NppSt8u>(0), bufSize)); - } - - - void sqrIntegral_8U_64U(const GpuMat& src, GpuMat& sqsum) - { -
CV_Assert(src.type() == CV_8U); - - NppStSize32u roiSize; - roiSize.width = src.cols; - roiSize.height = src.rows; - - NppSt32u bufSize; - nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize)); - GpuMat buf(1, bufSize, CV_8U); - - sqsum.create(src.rows + 1, src.cols + 1, CV_64F); - nppSafeCall(nppiStSqrIntegral_8u64u_C1R( - const_cast<NppSt8u*>(src.ptr<NppSt8u>(0)), src.step, - sqsum.ptr<NppSt64u>(0), sqsum.step, roiSize, - buf.ptr<NppSt8u>(0), bufSize)); - } + void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); @@ -384,7 +336,7 @@ namespace matchTemplate_CCORR_8U(image, templ, result); GpuMat img_sqsum; - sqrIntegral_8U_64U(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, @@ -409,7 +361,7 @@ namespace } GpuMat img_sqsum; - sqrIntegral_8U_64U(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; @@ -422,7 +374,7 @@ namespace void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result) { GpuMat img_sqsum; - sqrIntegral_8U_64U(image.reshape(1), img_sqsum); + sqrIntegral(image.reshape(1), img_sqsum); unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; @@ -439,7 +391,7 @@ namespace if (image.channels() == 1) { GpuMat image_sum; - integral_8U_32U(image, image_sum); + integral(image, image_sum); unsigned int templ_sum = (unsigned int)sum(templ)[0]; imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, @@ -452,7 +404,7 @@ namespace split(image, images); for (int i = 0; i < image.channels(); ++i) - integral_8U_32U(images[i], image_sums[i]); + integral(images[i], image_sums[i]); Scalar templ_sum = sum(templ); @@ -493,8 +445,8 @@ namespace if (image.channels() == 1) { GpuMat image_sum, image_sqsum; - integral_8U_32U(image, image_sum); - sqrIntegral_8U_64U(image, image_sqsum); + integral(image, image_sum); + sqrIntegral(image, image_sqsum); unsigned int templ_sum = (unsigned int)sum(templ)[0]; unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0]; @@ -512,8 +464,8 @@ namespace split(image, images); for (int i = 0; i < image.channels(); ++i) { - integral_8U_32U(images[i], image_sums[i]); - sqrIntegral_8U_64U(images[i], image_sqsums[i]); + integral(images[i], image_sums[i]); + sqrIntegral(images[i], image_sqsums[i]); } Scalar templ_sum = sum(templ);