Fixed gpu::matchTemplate to handle big templates correctly (templ_sqsum widened from unsigned int to unsigned long long). Added tests.

2012-03-26 09:19:33 +00:00
parent 098fc1a62e
commit 5434a9a5ec
4 changed files with 61 additions and 13 deletions
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -216,7 +216,7 @@ namespace cv { namespace gpu { namespace device
        // Prepared_SQDIFF

        template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
+        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -232,7 +232,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int cn>
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream)
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
@@ -244,10 +244,10 @@ namespace cv { namespace gpu { namespace device
                cudaSafeCall( cudaDeviceSynchronize() );
        }

-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, int cn, 
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, int cn,
                                             cudaStream_t stream)
        {
-            typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);

            static const caller_t callers[] = 
            {
@@ -284,7 +284,9 @@ namespace cv { namespace gpu { namespace device


        template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result)
+        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
+                int w, int h, const PtrStep<unsigned long long> image_sqsum,
+                unsigned long long templ_sqsum, DevMem2Df result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -301,7 +303,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <int cn>
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, 
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    DevMem2Df result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
@@ -315,10 +317,10 @@ namespace cv { namespace gpu { namespace device
        }


-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, 
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    DevMem2Df result, int cn, cudaStream_t stream)
        {
-            typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, cudaStream_t stream);
+            typedef void (*caller_t)(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream);
            static const caller_t callers[] = 
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -1739,7 +1739,7 @@ namespace cv { namespace gpu { namespace device
            template <typename T>
            void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
            {
-                typedef typename SumType<T>::R R;
+                typedef double R;

                dim3 threads, grid;
                estimateThreadCfg(src.cols, src.rows, threads, grid);
--- a/modules/gpu/src/match_template.cpp
+++ b/modules/gpu/src/match_template.cpp
@@ -62,10 +62,10 @@ namespace cv { namespace gpu { namespace device
        void matchTemplateNaive_SQDIFF_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
        void matchTemplateNaive_SQDIFF_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);

-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, 
+        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result,
            int cn, cudaStream_t stream);

-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned int templ_sqsum, DevMem2Df result, 
+        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum, unsigned long long templ_sqsum, DevMem2Df result,
            int cn, cudaStream_t stream);

        void matchTemplatePrepared_CCOFF_8U(int w, int h, const DevMem2D_<unsigned int> image_sum, unsigned int templ_sum, DevMem2Df result, cudaStream_t stream);
@@ -248,7 +248,7 @@ namespace
        GpuMat img_sqsum;
        sqrIntegral(image.reshape(1), img_sqsum, stream);

-        unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
+        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];

        matchTemplate_CCORR_8U(image, templ, result, stream);
        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
@@ -260,7 +260,7 @@ namespace
        GpuMat img_sqsum;
        sqrIntegral(image.reshape(1), img_sqsum, stream);

-        unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
+        unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];

        matchTemplate_CCORR_8U(image, templ, result, stream);
        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));