refactored gpu::matchTemplate (converted it into Algorithm)

2013-04-30 17:27:06 +04:00
parent 1fcc8074bd
commit de56163f97
5 changed files with 392 additions and 156 deletions
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -424,20 +424,24 @@ CV_EXPORTS void meanShiftSegmentation(InputArray src, OutputArray dst, int sp, i
 /////////////////////////// Match Template ////////////////////////////
-struct CV_EXPORTS MatchTemplateBuf
+//! computes the proximity map for the raster template and the image where the template is searched for
 class CV_EXPORTS TemplateMatching : public Algorithm
 {
-    Size user_block_size;
+public:
-    GpuMat imagef, templf;
+    virtual void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null()) = 0;
    std::vector<GpuMat> images;
    std::vector<GpuMat> image_sums;
    std::vector<GpuMat> image_sqsums;
 };
-//! computes the proximity map for the raster template and the image where the template is searched for
+CV_EXPORTS Ptr<TemplateMatching> createTemplateMatching(int srcType, int method, Size user_block_size = Size());
 CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null());
-//! computes the proximity map for the raster template and the image where the template is searched for
+// obsolete
-CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null());
+
 __OPENCV_GPUIMGPROC_DEPR_BEFORE__ void matchTemplate(InputArray image, InputArray templ, OutputArray result,
                                                     int method, Stream& stream = Stream::Null()) __OPENCV_GPUIMGPROC_DEPR_AFTER__;
 // Obsolete free-function entry point (its declaration just above carries the
 // __OPENCV_GPUIMGPROC_DEPR_* markers). It builds a TemplateMatching object for
 // image.type() and the requested method, leaving the factory's block-size
 // argument at its default, and forwards all arguments to match().
 // A new algorithm object is created on every call and released when the
 // temporary Ptr goes out of scope.
 inline void matchTemplate(InputArray image, InputArray templ, OutputArray result, int method, Stream& stream)
 {
    gpu::createTemplateMatching(image.type(), method)->match(image, templ, result, stream);
 }
 ////////////////////////// Bilateral Filter ///////////////////////////
--- a/modules/gpuimgproc/perf/perf_match_template.cpp
+++ b/modules/gpuimgproc/perf/perf_match_template.cpp
@@ -76,7 +76,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate8U,
        const cv::gpu::GpuMat d_templ(templ);
        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
+        cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
        TEST_CYCLE() alg->match(d_image, d_templ, dst);
        GPU_SANITY_CHECK(dst, 1e-5, ERROR_RELATIVE);
    }
@@ -116,7 +118,9 @@ PERF_TEST_P(Sz_TemplateSz_Cn_Method, MatchTemplate32F,
        const cv::gpu::GpuMat d_templ(templ);
        cv::gpu::GpuMat dst;
-        TEST_CYCLE() cv::gpu::matchTemplate(d_image, d_templ, dst, method);
+        cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
        TEST_CYCLE() alg->match(d_image, d_templ, dst);
        GPU_SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
    }
--- a/modules/gpuimgproc/src/match_template.cpp
+++ b/modules/gpuimgproc/src/match_template.cpp
@@ -47,7 +47,7 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_GPUARITHM) || defined (CUDA_DISABLER)
-void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+Ptr<gpu::TemplateMatching> cv::gpu::createTemplateMatching(int, int, Size) { throw_no_cuda(); return Ptr<gpu::TemplateMatching>(); }
 #else
@@ -137,11 +137,8 @@ namespace cv { namespace gpu { namespace cudev
    }
 }}}
 using namespace ::cv::gpu::cudev::match_template;
 namespace
 {
    // Evaluates the optimal template area threshold. If the
    // template's area is less than the threshold, we use the naive match
    // template version, otherwise the FFT-based one (if available).
@@ -149,135 +146,317 @@ namespace
    {
        switch (method)
        {
-        case cv::TM_CCORR:
+        case TM_CCORR:
            if (depth == CV_32F) return 250;
            if (depth == CV_8U) return 300;
            break;
-        case cv::TM_SQDIFF:
+
        case TM_SQDIFF:
            if (depth == CV_8U) return 300;
            break;
        }
-        CV_Error(cv::Error::StsBadArg, "getTemplateThreshold: unsupported match template mode");
+
        CV_Error(Error::StsBadArg, "unsupported match template mode");
        return 0;
    }
    ///////////////////////////////////////////////////////////////
    // CCORR_32F
-    void matchTemplate_CCORR_32F(
+    class Match_CCORR_32F : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+    public:
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_32F))
+        explicit Match_CCORR_32F(Size user_block_size);
        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
    private:
        Ptr<gpu::Convolution> conv_;
        GpuMat result_;
    };
    // Constructs the 32F cross-correlation matcher.
    // user_block_size is passed straight to gpu::createConvolution; the
    // resulting convolution object is kept in conv_ for later match() calls.
    Match_CCORR_32F::Match_CCORR_32F(Size user_block_size)
    {
        conv_ = gpu::createConvolution(user_block_size);
    }
    void Match_CCORR_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& _stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_32F );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        cudaStream_t stream = StreamAccessor::getStream(_stream);
        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
        GpuMat result = _result.getGpuMat();
        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_32F))
        {
-            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
+            matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), stream);
            return;
        }
        Ptr<gpu::Convolution> conv = gpu::createConvolution(buf.user_block_size);
        if (image.channels() == 1)
        {
-            conv->convolve(image.reshape(1), templ.reshape(1), result, true, stream);
+            conv_->convolve(image.reshape(1), templ.reshape(1), result, true, _stream);
        }
        else
        {
-            GpuMat result_;
+            conv_->convolve(image.reshape(1), templ.reshape(1), result_, true, _stream);
-            conv->convolve(image.reshape(1), templ.reshape(1), result_, true, stream);
+            extractFirstChannel_32F(result_, result, image.channels(), stream);
            extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
        }
    }
    ///////////////////////////////////////////////////////////////
    // CCORR_8U
-    void matchTemplate_CCORR_8U(
+    class Match_CCORR_8U : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_8U))
+    public:
        explicit Match_CCORR_8U(Size user_block_size) : match32F_(user_block_size)
        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        }
        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
    private:
        GpuMat imagef_, templf_;
        Match_CCORR_32F match32F_;
    };
    void Match_CCORR_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_8U))
        {
            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
            GpuMat result = _result.getGpuMat();
            matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
            return;
        }
-        image.convertTo(buf.imagef, CV_32F, stream);
+        image.convertTo(imagef_, CV_32F, stream);
-        templ.convertTo(buf.templf, CV_32F, stream);
+        templ.convertTo(templf_, CV_32F, stream);
-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+        match32F_.match(imagef_, templf_, _result, stream);
    }
    ///////////////////////////////////////////////////////////////
    // CCORR_NORMED_8U
-    void matchTemplate_CCORR_NORMED_8U(
+    class Match_CCORR_NORMED_8U : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+    public:
        explicit Match_CCORR_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
        }
-        buf.image_sqsums.resize(1);
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
+    private:
-        normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+        Match_CCORR_8U match_CCORR_;
        GpuMat image_sqsums_;
        GpuMat intBuffer_;
    };
    // TM_CCORR_NORMED for 8-bit input.
    //
    // Preconditions (asserted): image depth is CV_8U, image and templ have the
    // same type, and the image is at least as large as the template.
    //
    // The plain CCORR map is computed first by the nested Match_CCORR_8U, then
    // handed to normalize_8U together with the image's squared integral and the
    // template's sum of squares.
    // NOTE(review): normalize_8U presumably rescales `result` in place - confirm
    // against the cudev kernel, which is not visible here.
    void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        // The nested matcher allocates _result, so the GpuMat is fetched only afterwards.
        match_CCORR_.match(image, templ, _result, stream);
        GpuMat result = _result.getGpuMat();
        // reshape(1) presents the (possibly multi-channel) image as single-channel;
        // image_sqsums_ and intBuffer_ are members, so they persist between calls.
        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
        // Only element [0] of the returned Scalar is used; note that no stream
        // argument is passed to gpu::sqrSum here, unlike the surrounding calls.
        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
        normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }
    ///////////////////////////////////////////////////////////////
    // SQDIFF_32F
-    void matchTemplate_SQDIFF_32F(
+    class Match_SQDIFF_32F : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        (void)buf;
+    public:
-        result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
    };
    // TM_SQDIFF for 32-bit float input; always uses the naive kernel.
    //
    // Preconditions (asserted): image depth is CV_32F, image and templ have the
    // same type, and the image is at least as large as the template.
    //
    // _result is (re)created as a CV_32FC1 map of size
    // (image.rows - templ.rows + 1) x (image.cols - templ.cols + 1).
    void Match_SQDIFF_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_32F );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
        GpuMat result = _result.getGpuMat();
        // The kernel receives the raw stream handle obtained through StreamAccessor.
        matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
    }
    ///////////////////////////////////////////////////////////////
    // SQDIFF_8U
-    void matchTemplate_SQDIFF_8U(
+    class Match_SQDIFF_8U : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        if (templ.size().area() < getTemplateThreshold(cv::TM_SQDIFF, CV_8U))
+    public:
        explicit Match_SQDIFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
-            result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
+        }
        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
    private:
        GpuMat image_sqsums_;
        GpuMat intBuffer_;
        Match_CCORR_8U match_CCORR_;
    };
    void Match_SQDIFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        if (templ.size().area() < getTemplateThreshold(TM_SQDIFF, CV_8U))
        {
            _result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
            GpuMat result = _result.getGpuMat();
            matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
            return;
        }
-        buf.image_sqsums.resize(1);
+        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
+        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+        match_CCORR_.match(image, templ, _result, stream);
-        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+        GpuMat result = _result.getGpuMat();
        matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }
    ///////////////////////////////////////////////////////////////
    // SQDIFF_NORMED_8U
-    void matchTemplate_SQDIFF_NORMED_8U(
+    class Match_SQDIFF_NORMED_8U : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        buf.image_sqsums.resize(1);
+    public:
-        gpu::sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
+        explicit Match_SQDIFF_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
        }
-        unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ.reshape(1))[0];
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+    private:
-        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
+        GpuMat image_sqsums_;
        GpuMat intBuffer_;
        Match_CCORR_8U match_CCORR_;
    };
    // TM_SQDIFF_NORMED for 8-bit input.
    //
    // Preconditions (asserted): image depth is CV_8U, image and templ have the
    // same type, and the image is at least as large as the template.
    //
    // The image's squared integral and the template's sum of squares are
    // computed first, then the nested Match_CCORR_8U fills _result with the
    // CCORR map, and matchTemplatePrepared_SQDIFF_NORMED_8U receives all three.
    // NOTE(review): that kernel presumably converts the CCORR map to the
    // normalized squared difference in place - confirm in the cudev source.
    void Match_SQDIFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        // reshape(1) presents the image as single-channel; image_sqsums_ and
        // intBuffer_ are members, so they persist between calls.
        gpu::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
        // Only element [0] of the returned Scalar is used; no stream argument is
        // passed to gpu::sqrSum here, unlike the surrounding calls.
        unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ.reshape(1))[0];
        // The nested matcher allocates _result, so the GpuMat is fetched only afterwards.
        match_CCORR_.match(image, templ, _result, stream);
        GpuMat result = _result.getGpuMat();
        matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
    }
    ///////////////////////////////////////////////////////////////
    // CCOEFF_8U
-    void matchTemplate_CCOFF_8U(
+    class Match_CCOEFF_8U : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        matchTemplate_CCORR_8U(image, templ, result, buf, stream);
+    public:
        explicit Match_CCOEFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
        {
        }
        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
    private:
        GpuMat intBuffer_;
        std::vector<GpuMat> images_;
        std::vector<GpuMat> image_sums_;
        Match_CCORR_8U match_CCORR_;
    };
    void Match_CCOEFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        match_CCORR_.match(image, templ, _result, stream);
        GpuMat result = _result.getGpuMat();
        if (image.channels() == 1)
        {
-            buf.image_sums.resize(1);
+            image_sums_.resize(1);
-            gpu::integral(image, buf.image_sums[0], stream);
+            gpu::integral(image, image_sums_[0], intBuffer_, stream);
-            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
+            unsigned int templ_sum = (unsigned int) gpu::sum(templ)[0];
-            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
+
            matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
        }
        else
        {
-            gpu::split(image, buf.images);
+            gpu::split(image, images_);
-            buf.image_sums.resize(buf.images.size());
+
            image_sums_.resize(images_.size());
            for (int i = 0; i < image.channels(); ++i)
-                gpu::integral(buf.images[i], buf.image_sums[i], stream);
+                gpu::integral(images_[i], image_sums_[i], intBuffer_, stream);
            Scalar templ_sum = gpu::sum(templ);
@@ -285,60 +464,91 @@ namespace
            {
            case 2:
                matchTemplatePrepared_CCOFF_8UC2(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1],
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1],
                        result, StreamAccessor::getStream(stream));
                break;
            case 3:
                matchTemplatePrepared_CCOFF_8UC3(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2],
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2],
                        result, StreamAccessor::getStream(stream));
                break;
            case 4:
                matchTemplatePrepared_CCOFF_8UC4(
-                        templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3],
+                        templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
-                        (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
+                        (unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3],
-                        (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
+                        result, StreamAccessor::getStream(stream));
                break;
            default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+                CV_Error(Error::StsBadArg, "unsupported number of channels");
            }
        }
    }
    ///////////////////////////////////////////////////////////////
    // CCOEFF_NORMED_8U
-    void matchTemplate_CCOFF_NORMED_8U(
+    class Match_CCOEFF_NORMED_8U : public TemplateMatching
            const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
    {
-        image.convertTo(buf.imagef, CV_32F, stream);
+    public:
-        templ.convertTo(buf.templf, CV_32F, stream);
+        explicit Match_CCOEFF_NORMED_8U(Size user_block_size) : match_CCORR_32F_(user_block_size)
        {
        }
-        matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
+        void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
    private:
        GpuMat imagef_, templf_;
        Match_CCORR_32F match_CCORR_32F_;
        GpuMat intBuffer_;
        std::vector<GpuMat> images_;
        std::vector<GpuMat> image_sums_;
        std::vector<GpuMat> image_sqsums_;
    };
    void Match_CCOEFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
    {
        using namespace cv::gpu::cudev::match_template;
        GpuMat image = _image.getGpuMat();
        GpuMat templ = _templ.getGpuMat();
        CV_Assert( image.depth() == CV_8U );
        CV_Assert( image.type() == templ.type() );
        CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
        image.convertTo(imagef_, CV_32F, stream);
        templ.convertTo(templf_, CV_32F, stream);
        match_CCORR_32F_.match(imagef_, templf_, _result, stream);
        GpuMat result = _result.getGpuMat();
        if (image.channels() == 1)
        {
-            buf.image_sums.resize(1);
+            image_sums_.resize(1);
-            gpu::integral(image, buf.image_sums[0], stream);
+            gpu::integral(image, image_sums_[0], intBuffer_, stream);
            buf.image_sqsums.resize(1);
            gpu::sqrIntegral(image, buf.image_sqsums[0], stream);
-            unsigned int templ_sum = (unsigned int)gpu::sum(templ)[0];
+            image_sqsums_.resize(1);
-            unsigned long long templ_sqsum = (unsigned long long)gpu::sqrSum(templ)[0];
+            gpu::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
            unsigned int templ_sum = (unsigned int) gpu::sum(templ)[0];
            unsigned long long templ_sqsum = (unsigned long long) gpu::sqrSum(templ)[0];
            matchTemplatePrepared_CCOFF_NORMED_8U(
-                    templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0],
+                    templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
                    templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
        }
        else
        {
-            gpu::split(image, buf.images);
+            gpu::split(image, images_);
-            buf.image_sums.resize(buf.images.size());
+
-            buf.image_sqsums.resize(buf.images.size());
+            image_sums_.resize(images_.size());
            image_sqsums_.resize(images_.size());
            for (int i = 0; i < image.channels(); ++i)
            {
-                gpu::integral(buf.images[i], buf.image_sums[i], stream);
+                gpu::integral(images_[i], image_sums_[i], intBuffer_, stream);
-                gpu::sqrIntegral(buf.images[i], buf.image_sqsums[i], stream);
+                gpu::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream);
            }
            Scalar templ_sum = gpu::sum(templ);
@@ -349,8 +559,8 @@ namespace
            case 2:
                matchTemplatePrepared_CCOFF_NORMED_8UC2(
                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
+                        image_sums_[0], image_sqsums_[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
+                        image_sums_[1], image_sqsums_[1],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        result, StreamAccessor::getStream(stream));
@@ -358,9 +568,9 @@ namespace
            case 3:
                matchTemplatePrepared_CCOFF_NORMED_8UC3(
                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
+                        image_sums_[0], image_sqsums_[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
+                        image_sums_[1], image_sqsums_[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
+                        image_sums_[2], image_sqsums_[2],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
@@ -369,10 +579,10 @@ namespace
            case 4:
                matchTemplatePrepared_CCOFF_NORMED_8UC4(
                        templ.cols, templ.rows,
-                        buf.image_sums[0], buf.image_sqsums[0],
+                        image_sums_[0], image_sqsums_[0],
-                        buf.image_sums[1], buf.image_sqsums[1],
+                        image_sums_[1], image_sqsums_[1],
-                        buf.image_sums[2], buf.image_sqsums[2],
+                        image_sums_[2], image_sqsums_[2],
-                        buf.image_sums[3], buf.image_sqsums[3],
+                        image_sums_[3], image_sqsums_[3],
                        (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
                        (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
                        (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
@@ -380,46 +590,60 @@ namespace
                        result, StreamAccessor::getStream(stream));
                break;
            default:
-                CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
+                CV_Error(Error::StsBadArg, "unsupported number of channels");
            }
        }
    }
 }
-
+Ptr<gpu::TemplateMatching> cv::gpu::createTemplateMatching(int srcType, int method, Size user_block_size)
 void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
 {
-    MatchTemplateBuf buf;
+    const int sdepth = CV_MAT_DEPTH(srcType);
    matchTemplate(image, templ, result, method, buf, stream);
 }
    CV_Assert( sdepth == CV_8U || sdepth == CV_32F );
-void cv::gpu::matchTemplate(
+    if (sdepth == CV_32F)
        const GpuMat& image, const GpuMat& templ, GpuMat& result, int method,
        MatchTemplateBuf &buf, Stream& stream)
 {
    CV_Assert(image.type() == templ.type());
    CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
    typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream);
    static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
                                        ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
                                        ::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U };
    static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0,
                                         ::matchTemplate_CCORR_32F, 0, 0, 0 };
    const Caller* callers = 0;
    switch (image.depth())
    {
-        case CV_8U: callers = callers8U; break;
+        switch (method)
-        case CV_32F: callers = callers32F; break;
+        {
-        default: CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported data type");
+        case TM_SQDIFF:
-    }
+            return new Match_SQDIFF_32F;
-    Caller caller = callers[method];
+        case TM_CCORR:
-    CV_Assert(caller);
+            return new Match_CCORR_32F(user_block_size);
-    caller(image, templ, result, buf, stream);
+
        default:
            CV_Error( Error::StsBadFlag, "Unsopported method" );
            return Ptr<gpu::TemplateMatching>();
        }
    }
    else
    {
        switch (method)
        {
        case TM_SQDIFF:
            return new Match_SQDIFF_8U(user_block_size);
        case TM_SQDIFF_NORMED:
            return new Match_SQDIFF_NORMED_8U(user_block_size);
        case TM_CCORR:
            return new Match_CCORR_8U(user_block_size);
        case TM_CCORR_NORMED:
            return new Match_CCORR_NORMED_8U(user_block_size);
        case TM_CCOEFF:
            return new Match_CCOEFF_8U(user_block_size);
        case TM_CCOEFF_NORMED:
            return new Match_CCOEFF_NORMED_8U(user_block_size);
        default:
            CV_Error( Error::StsBadFlag, "Unsopported method" );
            return Ptr<gpu::TemplateMatching>();
        }
    }
 }
 #endif
--- a/modules/gpuimgproc/test/test_match_template.cpp
+++ b/modules/gpuimgproc/test/test_match_template.cpp
@@ -82,8 +82,10 @@ GPU_TEST_P(MatchTemplate8U, Accuracy)
    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_8U, cn));
    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_8U, cn));
    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
    cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
+    alg->match(loadMat(image), loadMat(templ), dst);
    cv::Mat dst_gold;
    cv::matchTemplate(image, templ, dst_gold, method);
@@ -128,8 +130,10 @@ GPU_TEST_P(MatchTemplate32F, Regression)
    cv::Mat image = randomMat(size, CV_MAKETYPE(CV_32F, cn));
    cv::Mat templ = randomMat(templ_size, CV_MAKETYPE(CV_32F, cn));
    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
    cv::gpu::GpuMat dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(templ), dst, method);
+    alg->match(loadMat(image), loadMat(templ), dst);
    cv::Mat dst_gold;
    cv::matchTemplate(image, templ, dst_gold, method);
@@ -169,8 +173,10 @@ GPU_TEST_P(MatchTemplateBlackSource, Accuracy)
    cv::Mat pattern = readImage("matchtemplate/cat.png");
    ASSERT_FALSE(pattern.empty());
    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), method);
    cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, method);
+    alg->match(loadMat(image), loadMat(pattern), d_dst);
    cv::Mat dst(d_dst);
@@ -214,8 +220,10 @@ GPU_TEST_P(MatchTemplate_CCOEF_NORMED, Accuracy)
    cv::Mat pattern = readImage(patternName);
    ASSERT_FALSE(pattern.empty());
    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(image.type(), cv::TM_CCOEFF_NORMED);
    cv::gpu::GpuMat d_dst;
-    cv::gpu::matchTemplate(loadMat(image), loadMat(pattern), d_dst, cv::TM_CCOEFF_NORMED);
+    alg->match(loadMat(image), loadMat(pattern), d_dst);
    cv::Mat dst(d_dst);
@@ -263,8 +271,10 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF_NORMED)
    cv::Mat templ = readImage("matchtemplate/template.png");
    ASSERT_FALSE(templ.empty());
    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(scene.type(), cv::TM_SQDIFF_NORMED);
    cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF_NORMED);
+    alg->match(loadMat(scene), loadMat(templ), d_result);
    cv::Mat result(d_result);
@@ -286,8 +296,10 @@ GPU_TEST_P(MatchTemplate_CanFindBigTemplate, SQDIFF)
    cv::Mat templ = readImage("matchtemplate/template.png");
    ASSERT_FALSE(templ.empty());
    cv::Ptr<cv::gpu::TemplateMatching> alg = cv::gpu::createTemplateMatching(scene.type(), cv::TM_SQDIFF);
    cv::gpu::GpuMat d_result;
-    cv::gpu::matchTemplate(loadMat(scene), loadMat(templ), d_result, cv::TM_SQDIFF);
+    alg->match(loadMat(scene), loadMat(templ), d_result);
    cv::Mat result(d_result);
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -17,24 +17,16 @@
 using namespace std;
 using namespace cv;
 // Runs one gpu::matchTemplate(TM_CCORR) call on generated 500x500 CV_32F data.
 // The result (d_dst) is discarded; TEST(matchTemplate) calls this before its
 // measured loop.
 // NOTE(review): presumably a warm-up so one-time GPU initialization is not
 // counted in the timings - confirm intent.
 static void InitMatchTemplate()
 {
    Mat src; gen(src, 500, 500, CV_32F, 0, 1);
    Mat templ; gen(templ, 500, 500, CV_32F, 0, 1);
    // GpuMats are constructed directly from the host matrices.
    gpu::GpuMat d_src(src), d_templ(templ), d_dst;
    gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
 }
 TEST(matchTemplate)
 {
    InitMatchTemplate();
    Mat src, templ, dst;
    gen(src, 3000, 3000, CV_32F, 0, 1);
    gpu::GpuMat d_src(src), d_templ, d_dst;
    Ptr<gpu::TemplateMatching> alg = gpu::createTemplateMatching(src.type(), TM_CCORR);
    for (int templ_size = 5; templ_size < 200; templ_size *= 5)
    {
        SUBTEST << src.cols << 'x' << src.rows << ", 32FC1" << ", templ " << templ_size << 'x' << templ_size << ", CCORR";
@@ -47,10 +39,10 @@ TEST(matchTemplate)
        CPU_OFF;
        d_templ.upload(templ);
-        gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
+        alg->match(d_src, d_templ, d_dst);
        GPU_ON;
-        gpu::matchTemplate(d_src, d_templ, d_dst, TM_CCORR);
+        alg->match(d_src, d_templ, d_dst);
        GPU_OFF;
    }
 }