From 26afa49d710603b225fc6cb8b9aa3e2369980a99 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Thu, 25 Dec 2014 15:41:14 +0300
Subject: [PATCH] fix cuda match template:

use correct types for integral/sum outputs
---
 .../cudaimgproc/src/cuda/match_template.cu    | 146 +++++++++---------
 modules/cudaimgproc/src/match_template.cpp    | 124 +++++++--------
 2 files changed, 135 insertions(+), 135 deletions(-)
diff --git a/modules/cudaimgproc/src/cuda/match_template.cu b/modules/cudaimgproc/src/cuda/match_template.cu
index 832878f9f..87ee71e1e 100644
--- a/modules/cudaimgproc/src/cuda/match_template.cu
+++ b/modules/cudaimgproc/src/cuda/match_template.cu
@@ -218,7 +218,7 @@ namespace cv { namespace cuda { namespace device
         // Prepared_SQDIFF
 
         template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
+        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<double> image_sqsum, double templ_sqsum, PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -234,7 +234,7 @@ namespace cv { namespace cuda { namespace device
         }
 
         template <int cn>
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream)
         {
             const dim3 threads(32, 8);
             const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -246,10 +246,10 @@ namespace cv { namespace cuda { namespace device
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
 
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, int cn,
                                              cudaStream_t stream)
         {
-            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
+            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
 
             static const caller_t callers[] =
             {
@@ -287,8 +287,8 @@ namespace cv { namespace cuda { namespace device
 
         template <int cn>
         __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
-                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, PtrStepSzf result)
+                int w, int h, const PtrStep<double> image_sqsum,
+                double templ_sqsum, PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -305,7 +305,7 @@ namespace cv { namespace cuda { namespace device
         }
 
         template <int cn>
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
                                                     PtrStepSzf result, cudaStream_t stream)
         {
             const dim3 threads(32, 8);
@@ -319,10 +319,10 @@ namespace cv { namespace cuda { namespace device
         }
 
 
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum,
                                                     PtrStepSzf result, int cn, cudaStream_t stream)
         {
-            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
+            typedef void (*caller_t)(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result, cudaStream_t stream);
             static const caller_t callers[] =
             {
                 0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
@@ -334,7 +334,7 @@ namespace cv { namespace cuda { namespace device
         //////////////////////////////////////////////////////////////////////
         // Prepared_CCOFF
 
-        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
+        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<int> image_sum, PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -349,7 +349,7 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
+        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
             dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -365,8 +365,8 @@ namespace cv { namespace cuda { namespace device
 
         __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
                 int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
+                const PtrStep<int> image_sum_r,
+                const PtrStep<int> image_sum_g,
                 PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -388,9 +388,9 @@ namespace cv { namespace cuda { namespace device
 
         void matchTemplatePrepared_CCOFF_8UC2(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                unsigned int templ_sum_r, unsigned int templ_sum_g,
+                const PtrStepSz<int> image_sum_r,
+                const PtrStepSz<int> image_sum_g,
+                int templ_sum_r, int templ_sum_g,
                 PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
@@ -412,9 +412,9 @@ namespace cv { namespace cuda { namespace device
                 float templ_sum_scale_r,
                 float templ_sum_scale_g,
                 float templ_sum_scale_b,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                const PtrStep<unsigned int> image_sum_b,
+                const PtrStep<int> image_sum_r,
+                const PtrStep<int> image_sum_g,
+                const PtrStep<int> image_sum_b,
                 PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -440,12 +440,12 @@ namespace cv { namespace cuda { namespace device
 
         void matchTemplatePrepared_CCOFF_8UC3(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
+                const PtrStepSz<int> image_sum_r,
+                const PtrStepSz<int> image_sum_g,
+                const PtrStepSz<int> image_sum_b,
+                int templ_sum_r,
+                int templ_sum_g,
+                int templ_sum_b,
                 PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
@@ -471,10 +471,10 @@ namespace cv { namespace cuda { namespace device
                 float templ_sum_scale_g,
                 float templ_sum_scale_b,
                 float templ_sum_scale_a,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                const PtrStep<unsigned int> image_sum_b,
-                const PtrStep<unsigned int> image_sum_a,
+                const PtrStep<int> image_sum_r,
+                const PtrStep<int> image_sum_g,
+                const PtrStep<int> image_sum_b,
+                const PtrStep<int> image_sum_a,
                 PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -504,14 +504,14 @@ namespace cv { namespace cuda { namespace device
 
         void matchTemplatePrepared_CCOFF_8UC4(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                const PtrStepSz<unsigned int> image_sum_a,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                unsigned int templ_sum_a,
+                const PtrStepSz<int> image_sum_r,
+                const PtrStepSz<int> image_sum_g,
+                const PtrStepSz<int> image_sum_b,
+                const PtrStepSz<int> image_sum_a,
+                int templ_sum_r,
+                int templ_sum_g,
+                int templ_sum_b,
+                int templ_sum_a,
                 PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
@@ -537,8 +537,8 @@ namespace cv { namespace cuda { namespace device
         __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
                 int w, int h, float weight,
                 float templ_sum_scale, float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum,
-                const PtrStep<unsigned long long> image_sqsum,
+                const PtrStep<int> image_sum,
+                const PtrStep<double> image_sqsum,
                 PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -559,9 +559,9 @@ namespace cv { namespace cuda { namespace device
         }
 
         void matchTemplatePrepared_CCOFF_NORMED_8U(
-                    int w, int h, const PtrStepSz<unsigned int> image_sum,
-                    const PtrStepSz<unsigned long long> image_sqsum,
-                    unsigned int templ_sum, unsigned long long templ_sqsum,
+                    int w, int h, const PtrStepSz<int> image_sum,
+                    const PtrStepSz<double> image_sqsum,
+                    int templ_sum, double templ_sqsum,
                     PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
@@ -586,8 +586,8 @@ namespace cv { namespace cuda { namespace device
                 int w, int h, float weight,
                 float templ_sum_scale_r, float templ_sum_scale_g,
                 float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
+                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
+                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
                 PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -618,10 +618,10 @@ namespace cv { namespace cuda { namespace device
 
         void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                     int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
+                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
+                    int templ_sum_r, double templ_sqsum_r,
+                    int templ_sum_g, double templ_sqsum_g,
                     PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
@@ -652,9 +652,9 @@ namespace cv { namespace cuda { namespace device
                 int w, int h, float weight,
                 float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                 float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
+                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
+                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
+                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
                 PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -693,12 +693,12 @@ namespace cv { namespace cuda { namespace device
 
         void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                     int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
+                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
+                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
+                    const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
+                    int templ_sum_r, double templ_sqsum_r,
+                    int templ_sum_g, double templ_sqsum_g,
+                    int templ_sum_b, double templ_sqsum_b,
                     PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
@@ -732,10 +732,10 @@ namespace cv { namespace cuda { namespace device
                 int w, int h, float weight,
                 float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                 float templ_sum_scale_a, float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
-                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
+                const PtrStep<int> image_sum_r, const PtrStep<double> image_sqsum_r,
+                const PtrStep<int> image_sum_g, const PtrStep<double> image_sqsum_g,
+                const PtrStep<int> image_sum_b, const PtrStep<double> image_sqsum_b,
+                const PtrStep<int> image_sum_a, const PtrStep<double> image_sqsum_a,
                 PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -780,14 +780,14 @@ namespace cv { namespace cuda { namespace device
 
         void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                     int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
+                    const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
+                    const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
+                    const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
+                    const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
+                    int templ_sum_r, double templ_sqsum_r,
+                    int templ_sum_g, double templ_sqsum_g,
+                    int templ_sum_b, double templ_sqsum_b,
+                    int templ_sum_a, double templ_sqsum_a,
                     PtrStepSzf result, cudaStream_t stream)
         {
             dim3 threads(32, 8);
@@ -823,8 +823,8 @@ namespace cv { namespace cuda { namespace device
 
         template <int cn>
         __global__ void normalizeKernel_8U(
-                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, PtrStepSzf result)
+                int w, int h, const PtrStep<double> image_sqsum,
+                double templ_sqsum, PtrStepSzf result)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -838,8 +838,8 @@ namespace cv { namespace cuda { namespace device
             }
         }
 
-        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
-                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
+        void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
+                          double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
         {
             dim3 threads(32, 8);
             dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
diff --git a/modules/cudaimgproc/src/match_template.cpp b/modules/cudaimgproc/src/match_template.cpp
index 19d091588..c5ab143ec 100644
--- a/modules/cudaimgproc/src/match_template.cpp
+++ b/modules/cudaimgproc/src/match_template.cpp
@@ -61,77 +61,77 @@ namespace cv { namespace cuda { namespace device
         void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
         void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
 
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result,
             int cn, cudaStream_t stream);
 
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<double> image_sqsum, double templ_sqsum, PtrStepSzf result,
             int cn, cudaStream_t stream);
 
-        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
+        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<int> image_sum, int templ_sum, PtrStepSzf result, cudaStream_t stream);
         void matchTemplatePrepared_CCOFF_8UC2(
             int w, int h,
-            const PtrStepSz<unsigned int> image_sum_r,
-            const PtrStepSz<unsigned int> image_sum_g,
-            unsigned int templ_sum_r,
-            unsigned int templ_sum_g,
+            const PtrStepSz<int> image_sum_r,
+            const PtrStepSz<int> image_sum_g,
+            int templ_sum_r,
+            int templ_sum_g,
             PtrStepSzf result, cudaStream_t stream);
         void matchTemplatePrepared_CCOFF_8UC3(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
+                const PtrStepSz<int> image_sum_r,
+                const PtrStepSz<int> image_sum_g,
+                const PtrStepSz<int> image_sum_b,
+                int templ_sum_r,
+                int templ_sum_g,
+                int templ_sum_b,
                 PtrStepSzf result, cudaStream_t stream);
         void matchTemplatePrepared_CCOFF_8UC4(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                const PtrStepSz<unsigned int> image_sum_a,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                unsigned int templ_sum_a,
+                const PtrStepSz<int> image_sum_r,
+                const PtrStepSz<int> image_sum_g,
+                const PtrStepSz<int> image_sum_b,
+                const PtrStepSz<int> image_sum_a,
+                int templ_sum_r,
+                int templ_sum_g,
+                int templ_sum_b,
+                int templ_sum_a,
                 PtrStepSzf result, cudaStream_t stream);
 
 
         void matchTemplatePrepared_CCOFF_NORMED_8U(
-                int w, int h, const PtrStepSz<unsigned int> image_sum,
-                const PtrStepSz<unsigned long long> image_sqsum,
-                unsigned int templ_sum, unsigned long long templ_sqsum,
+                int w, int h, const PtrStepSz<int> image_sum,
+                const PtrStepSz<double> image_sqsum,
+                int templ_sum, double templ_sqsum,
                 PtrStepSzf result, cudaStream_t stream);
         void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
+                const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
+                const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
+                int templ_sum_r, double templ_sqsum_r,
+                int templ_sum_g, double templ_sqsum_g,
                 PtrStepSzf result, cudaStream_t stream);
         void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
+                const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
+                const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
+                const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
+                int templ_sum_r, double templ_sqsum_r,
+                int templ_sum_g, double templ_sqsum_g,
+                int templ_sum_b, double templ_sqsum_b,
                 PtrStepSzf result, cudaStream_t stream);
         void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                 int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
-                unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
+                const PtrStepSz<int> image_sum_r, const PtrStepSz<double> image_sqsum_r,
+                const PtrStepSz<int> image_sum_g, const PtrStepSz<double> image_sqsum_g,
+                const PtrStepSz<int> image_sum_b, const PtrStepSz<double> image_sqsum_b,
+                const PtrStepSz<int> image_sum_a, const PtrStepSz<double> image_sqsum_a,
+                int templ_sum_r, double templ_sqsum_r,
+                int templ_sum_g, double templ_sqsum_g,
+                int templ_sum_b, double templ_sqsum_b,
+                int templ_sum_a, double templ_sqsum_a,
                 PtrStepSzf result, cudaStream_t stream);
 
-        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
-                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
+        void normalize_8U(int w, int h, const PtrStepSz<double> image_sqsum,
+                          double templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
 
         void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
     }
@@ -290,7 +290,7 @@ namespace
 
         cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
 
-        unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
+        double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
 
         normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
     }
@@ -361,7 +361,7 @@ namespace
 
         cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
 
-        unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
+        double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
 
         match_CCORR_.match(image, templ, _result, stream);
         GpuMat result = _result.getGpuMat();
@@ -400,7 +400,7 @@ namespace
 
         cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
 
-        unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
+        double templ_sqsum = cuda::sqrSum(templ.reshape(1))[0];
 
         match_CCORR_.match(image, templ, _result, stream);
         GpuMat result = _result.getGpuMat();
@@ -446,7 +446,7 @@ namespace
             image_sums_.resize(1);
             cuda::integral(image, image_sums_[0], intBuffer_, stream);
 
-            unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];
+            int templ_sum = (int) cuda::sum(templ)[0];
 
             matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
         }
@@ -465,19 +465,19 @@ namespace
             case 2:
                 matchTemplatePrepared_CCOFF_8UC2(
                         templ.cols, templ.rows, image_sums_[0], image_sums_[1],
-                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1],
+                        (int) templ_sum[0], (int) templ_sum[1],
                         result, StreamAccessor::getStream(stream));
                 break;
             case 3:
                 matchTemplatePrepared_CCOFF_8UC3(
                         templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
-                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2],
+                        (int) templ_sum[0], (int) templ_sum[1], (int) templ_sum[2],
                         result, StreamAccessor::getStream(stream));
                 break;
             case 4:
                 matchTemplatePrepared_CCOFF_8UC4(
                         templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
-                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3],
+                        (int) templ_sum[0], (int) templ_sum[1], (int) templ_sum[2], (int) templ_sum[3],
                         result, StreamAccessor::getStream(stream));
                 break;
             default:
@@ -532,8 +532,8 @@ namespace
             image_sqsums_.resize(1);
             cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
 
-            unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];
-            unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ)[0];
+            int templ_sum = (int) cuda::sum(templ)[0];
+            double templ_sqsum = cuda::sqrSum(templ)[0];
 
             matchTemplatePrepared_CCOFF_NORMED_8U(
                     templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
@@ -561,8 +561,8 @@ namespace
                         templ.cols, templ.rows,
                         image_sums_[0], image_sqsums_[0],
                         image_sums_[1], image_sqsums_[1],
-                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
-                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
+                        (int)templ_sum[0], templ_sqsum[0],
+                        (int)templ_sum[1], templ_sqsum[1],
                         result, StreamAccessor::getStream(stream));
                 break;
             case 3:
@@ -571,9 +571,9 @@ namespace
                         image_sums_[0], image_sqsums_[0],
                         image_sums_[1], image_sqsums_[1],
                         image_sums_[2], image_sqsums_[2],
-                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
-                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
-                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
+                        (int)templ_sum[0], templ_sqsum[0],
+                        (int)templ_sum[1], templ_sqsum[1],
+                        (int)templ_sum[2], templ_sqsum[2],
                         result, StreamAccessor::getStream(stream));
                 break;
             case 4:
@@ -583,10 +583,10 @@ namespace
                         image_sums_[1], image_sqsums_[1],
                         image_sums_[2], image_sqsums_[2],
                         image_sums_[3], image_sqsums_[3],
-                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
-                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
-                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
-                        (unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
+                        (int)templ_sum[0], templ_sqsum[0],
+                        (int)templ_sum[1], templ_sqsum[1],
+                        (int)templ_sum[2], templ_sqsum[2],
+                        (int)templ_sum[3], templ_sqsum[3],
                         result, StreamAccessor::getStream(stream));
                 break;
             default: