From 77501f3ed0becb50aa1939611340f6ef9e985927 Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Mon, 8 Apr 2013 15:19:44 +0800
Subject: [PATCH 1/3] ocl: Add dft based convolve implementation.

Template matching in the ocl module can now use the DFT-based
convolve. Note that this feature requires OpenCV to be built with the
clAmdFft library.
---
 modules/ocl/include/opencv2/ocl.hpp           |  24 ++-
 modules/ocl/src/imgproc.cpp                   | 159 +++++++++++++++++-
 modules/ocl/src/match_template.cpp            |  30 +++-
 .../opencl/imgproc_mulAndScaleSpectrums.cl    |  96 +++++++++++
 modules/ocl/test/test_fft.cpp                 | 134 +++++++++++++++
 5 files changed, 432 insertions(+), 11 deletions(-)
 create mode 100644 modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
diff --git a/modules/ocl/include/opencv2/ocl.hpp b/modules/ocl/include/opencv2/ocl.hpp
index f79e6b818..3145c6098 100644
--- a/modules/ocl/include/opencv2/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl.hpp
@@ -540,9 +540,29 @@ namespace cv
         CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2);
         CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2);
 
-        //! computes convolution of two images
+        struct CV_EXPORTS ConvolveBuf
+        {
+            Size result_size;
+            Size block_size;
+            Size user_block_size;
+            Size dft_size;
+
+            oclMat image_spect, templ_spect, result_spect;
+            oclMat image_block, templ_block, result_data;
+
+            void create(Size image_size, Size templ_size);
+            static Size estimateBlockSize(Size result_size, Size templ_size);
+        };
+
+        //! computes convolution of two images, may use discrete Fourier transform
         //! support only CV_32FC1 type
-        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result);
+        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false);
+        CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf);
+
+        //! Performs a per-element multiplication of two Fourier spectrums.
+        //! Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now.
+        //! support only CV_32FC2 type
+        CV_EXPORTS void mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int flags, float scale, bool conjB = false);
 
         CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code , int dcn = 0);
 
diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index 47c71477e..e3b3d2522 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -25,6 +25,7 @@
 //    Xu Pang, pangxu010@163.com
 //    Wu Zailong, bullet@yeah.net
 //    Wenju He, wenju@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
@@ -79,6 +80,7 @@ namespace cv
         extern const char *imgproc_calcHarris;
         extern const char *imgproc_calcMinEigenVal;
         extern const char *imgproc_convolve;
+        extern const char *imgproc_mulAndScaleSpectrums;
         ////////////////////////////////////OpenCL call wrappers////////////////////////////
 
         template <typename T> struct index_and_sizeof;
@@ -1585,11 +1587,148 @@ namespace cv
 
     }
 }
+//////////////////////////////////mulSpectrums////////////////////////////////////////////////////
+// Element-wise product of two full CV_32FC2 spectrums: c = scale * a * b, or
+// c = scale * a * conj(b) when conjB is true. The flags argument is ignored.
+void cv::ocl::mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int /*flags*/, float scale, bool conjB)
+{
+    CV_Assert(a.type() == CV_32FC2);
+    CV_Assert(b.type() == CV_32FC2 && b.size() == a.size()); // b is read at the same indices as a
+    c.create(a.size(), CV_32FC2);
+
+    size_t lt[3]  = { 16, 16, 1 };
+    size_t gt[3]  = { (size_t)a.cols, (size_t)a.rows, 1 }; // explicit casts: int -> size_t narrows inside braces
+
+    String kernelName = conjB ? "mulAndScaleSpectrumsKernel_CONJ":"mulAndScaleSpectrumsKernel";
+
+    std::vector<std::pair<size_t , const void *> > args;
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&a.data ));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&b.data ));
+    args.push_back( std::make_pair( sizeof(cl_float), (void *)&scale));
+    args.push_back( std::make_pair( sizeof(cl_mem), (void *)&c.data ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.cols ));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.rows));
+    args.push_back( std::make_pair( sizeof(cl_int), (void *)&a.step )); // a's row step is used for a, b and c
+
+    openCLExecuteKernel(Context::getContext(), &imgproc_mulAndScaleSpectrums, kernelName, gt, lt, args, -1, -1);
+}
 //////////////////////////////////convolve////////////////////////////////////////////////////
 inline int divUp(int total, int grain)
 {
     return (total + grain - 1) / grain;
 }
+
+// ported from CUDA module
+// Sizes the working buffers for a block-wise DFT convolution of an image_size image
+// with a templ_size template. Fills result_size, block_size and dft_size and calls
+// create() on the six oclMat members; user_block_size is read but never modified.
+void cv::ocl::ConvolveBuf::create(Size image_size, Size templ_size)
+{
+    result_size = Size(image_size.width - templ_size.width + 1,
+                       image_size.height - templ_size.height + 1);
+    // The template must fit inside the image, otherwise there is nothing to compute.
+    CV_Assert(result_size.width > 0 && result_size.height > 0);
+
+    // A zero component in user_block_size requests the automatic estimate.
+    block_size = user_block_size;
+    if (user_block_size.width == 0 || user_block_size.height == 0)
+        block_size = estimateBlockSize(result_size, templ_size);
+
+    // Each DFT covers one result block plus the template overlap.
+    dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1);
+    dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1);
+
+    // To avoid wasting time doing small DFTs
+    dft_size.width = std::max(dft_size.width, 512);
+    dft_size.height = std::max(dft_size.height, 512);
+
+    image_block.create(dft_size, CV_32F);
+    templ_block.create(dft_size, CV_32F);
+    result_data.create(dft_size, CV_32F);
+
+    // Spectrum buffers: dft_size.height rows of dft_size.width / 2 + 1 CV_32FC2 values
+    image_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
+    templ_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
+    result_spect.create(dft_size.height, dft_size.width / 2 + 1, CV_32FC2);
+
+    // Use maximum result matrix block size for the estimated DFT block size
+    block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width);
+    block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height);
+}
+
+Size cv::ocl::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/)
+{
+    int width = (result_size.width + 2) / 3;
+    int height = (result_size.height + 2) / 3;
+    width = std::min(width, result_size.width);
+    height = std::min(height, result_size.height);    
+    return Size(width, height);
+}
+
+static void convolve_run_fft(const oclMat &image, const oclMat &templ, oclMat &result, bool ccorr, ConvolveBuf& buf)
+{
+#if defined HAVE_CLAMDFFT
+    CV_Assert(image.type() == CV_32F);
+    CV_Assert(templ.type() == CV_32F);
+
+    buf.create(image.size(), templ.size());
+    result.create(buf.result_size, CV_32F);
+
+    Size& block_size = buf.block_size;
+    Size& dft_size = buf.dft_size;
+
+    oclMat& image_block = buf.image_block;
+    oclMat& templ_block = buf.templ_block;
+    oclMat& result_data = buf.result_data;
+
+    oclMat& image_spect = buf.image_spect;
+    oclMat& templ_spect = buf.templ_spect;
+    oclMat& result_spect = buf.result_spect;
+
+    oclMat templ_roi = templ;
+    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
+                   templ_block.cols - templ_roi.cols, 0, Scalar());
+
+    cv::ocl::dft(templ_block, templ_spect, dft_size);
+
+    // Process all blocks of the result matrix
+    for (int y = 0; y < result.rows; y += block_size.height)
+    {
+        for (int x = 0; x < result.cols; x += block_size.width)
+        {
+            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
+                                std::min(y + dft_size.height, image.rows) - y);
+            Rect roi0(x, y, image_roi_size.width, image_roi_size.height);
+
+            oclMat image_roi(image, roi0);
+
+            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
+                           0, image_block.cols - image_roi.cols, 0, Scalar());
+
+            cv::ocl::dft(image_block, image_spect, dft_size);
+
+            mulSpectrums(image_spect, templ_spect, result_spect, 0,
+                                 1.f / dft_size.area(), ccorr);
+
+            cv::ocl::dft(result_spect, result_data, dft_size, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT);
+
+            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
+                                 std::min(y + block_size.height, result.rows) - y);
+            
+            Rect roi1(x, y, result_roi_size.width, result_roi_size.height);
+            Rect roi2(0, 0, result_roi_size.width, result_roi_size.height);
+
+            oclMat result_roi(result, roi1);
+            oclMat result_block(result_data, roi2);
+
+            result_block.copyTo(result_roi);
+        }
+    }
+
+#else
+    CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
+#endif
+}
 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, String kernelName, const char **kernelString)
 {
     CV_Assert(src.depth() == CV_32FC1);
@@ -1630,13 +1769,25 @@ static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, St
 
     openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
 }
-void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y)
+void cv::ocl::convolve(const oclMat &x, const oclMat &t, oclMat &y, bool ccorr)
 {
     CV_Assert(x.depth() == CV_32F);
     CV_Assert(t.depth() == CV_32F);
-    CV_Assert(x.type() == y.type() && x.size() == y.size());
     y.create(x.size(), x.type());
     String kernelName = "convolve";
-
-    convolve_run(x, t, y, kernelName, &imgproc_convolve);
+    if(t.cols > 17 || t.rows > 17)
+    {
+        ConvolveBuf buf;
+        convolve_run_fft(x, t, y, ccorr, buf);
+    }
+    else
+    {
+        CV_Assert(ccorr == false);
+        convolve_run(x, t, y, kernelName, &imgproc_convolve);
+    }
+}
+void cv::ocl::convolve(const oclMat &image, const oclMat &templ, oclMat &result, bool ccorr, ConvolveBuf& buf)
+{
+    result.create(image.size(), image.type());
+    convolve_run_fft(image, templ, result, ccorr, buf);
 }
diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp
index 40c544301..9dee1f4ea 100644
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@@ -98,11 +98,22 @@ namespace cv
         // Evaluates optimal template's area threshold. If
         // template's area is less  than the threshold, we use naive match
         // template version, otherwise FFT-based (if available)
-        static bool useNaive(int , int , Size )
+        static bool useNaive(int method, int depth, Size size)
         {
-            // FIXME!
-            //   always use naive until convolve is imported
+#ifdef HAVE_CLAMDFFT
+            if (method == CV_TM_SQDIFF && (depth == CV_32F || !Context::getContext()->supportsFeature(Context::CL_DOUBLE)))
+            {
+                return true;
+            }
+            else if(method == CV_TM_CCORR || (method == CV_TM_SQDIFF && depth == CV_8U))
+            {
+                return size.height < 18 && size.width < 18;
+            }
+            else
+                return false;
+#else
             return true;
+#endif
         }
 
         //////////////////////////////////////////////////////////////////////
@@ -223,9 +234,18 @@ namespace cv
         //////////////////////////////////////////////////////////////////////
         // CCORR
         void convolve_32F(
-            const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
+            const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
         {
-            CV_Error(-1, "convolve is not fully implemented yet");
+            ConvolveBuf convolve_buf;
+            convolve_buf.user_block_size = buf.user_block_size;
+            if (image.oclchannels() == 1)
+                convolve(image, templ, result, true, convolve_buf);
+            else
+            {
+                oclMat result_;
+                convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf);
+                extractFirstChannel_32F(result_, result);
+            }
         }
 
         void matchTemplate_CCORR(
diff --git a/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl b/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
new file mode 100644
index 000000000..7d3774d07
--- /dev/null
+++ b/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
@@ -0,0 +1,96 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+typedef float2 cfloat;
+inline cfloat cmulf(cfloat a, cfloat b)
+{
+    return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
+}
+
+inline cfloat conjf(cfloat a)
+{
+    return (cfloat)( a.x, - a.y );
+}
+
+__kernel void 
+    mulAndScaleSpectrumsKernel(
+    __global const cfloat* a, 
+    __global const cfloat* b, 
+    float scale, 
+    __global cfloat* dst, 
+    uint cols, 
+    uint rows, 
+    uint mstep
+    )
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint idx = mad24(y, mstep / sizeof(cfloat), x);
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], b[idx]);
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
+__kernel void 
+    mulAndScaleSpectrumsKernel_CONJ(
+    __global const cfloat* a, 
+    __global const cfloat* b, 
+    float scale, 
+    __global cfloat* dst, 
+    uint cols, 
+    uint rows, 
+    uint mstep
+    )
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint idx = mad24(y, mstep / sizeof(cfloat), x);
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], conjf(b[idx]));
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
diff --git a/modules/ocl/test/test_fft.cpp b/modules/ocl/test/test_fft.cpp
index 0fee8b03d..d19a47132 100644
--- a/modules/ocl/test/test_fft.cpp
+++ b/modules/ocl/test/test_fft.cpp
@@ -103,4 +103,138 @@ INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Dft, testing::Combine(
                             testing::Values(cv::Size(2, 3), cv::Size(5, 4), cv::Size(25, 20), cv::Size(512, 1), cv::Size(1024, 768)),
                             testing::Values(0, (int)cv::DFT_ROWS, (int)cv::DFT_SCALE) ));
 
+////////////////////////////////////////////////////////////////////////////
+// MulSpectrums
+
+PARAM_TEST_CASE(MulSpectrums, cv::Size, DftFlags, bool)
+{
+    cv::Size size;
+    int flag;
+    bool ccorr;
+    cv::Mat a, b;
+
+    virtual void SetUp()
+    {
+        size  = GET_PARAM(0);
+        flag  = GET_PARAM(1);
+        ccorr = GET_PARAM(2);
+
+        a = randomMat(size, CV_32FC2);
+        b = randomMat(size, CV_32FC2);
+    }
+};
+
+TEST_P(MulSpectrums, Simple)
+{
+    cv::ocl::oclMat c;
+    cv::ocl::mulSpectrums(cv::ocl::oclMat(a), cv::ocl::oclMat(b), c, flag, 1.0, ccorr);
+
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, ccorr);
+
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2, "");
+}
+
+TEST_P(MulSpectrums, Scaled)
+{
+    float scale = 1.f / size.area();
+
+    cv::ocl::oclMat c;
+    cv::ocl::mulSpectrums(cv::ocl::oclMat(a), cv::ocl::oclMat(b), c, flag, scale, ccorr);
+
+    cv::Mat c_gold;
+    cv::mulSpectrums(a, b, c_gold, flag, ccorr);
+    c_gold.convertTo(c_gold, c_gold.type(), scale);
+
+    EXPECT_MAT_NEAR(c_gold, c, 1e-2, "");
+}
+
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(
+    DIFFERENT_SIZES,
+    testing::Values(DftFlags(0)),
+    testing::Values(false, true)));
+
+
+////////////////////////////////////////////////////////
+// Convolve
+
+void static convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
+{
+    // reallocate the output array if needed
+    C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type());
+    cv::Size dftSize;
+
+    // compute the size of DFT transform
+    dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
+    dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
+
+    // allocate temporary buffers and initialize them with 0s
+    cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0));
+    cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0));
+
+    // copy A and B to the top-left corners of tempA and tempB, respectively
+    cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows));
+    A.copyTo(roiA);
+    cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows));
+    B.copyTo(roiB);
+
+    // now transform the padded A & B in-place;
+    // use "nonzeroRows" hint for faster processing
+    cv::dft(tempA, tempA, 0, A.rows);
+    cv::dft(tempB, tempB, 0, B.rows);
+
+    // multiply the spectrums;
+    // the function handles packed spectrum representations well
+    cv::mulSpectrums(tempA, tempB, tempA, 0, ccorr);
+
+    // transform the product back from the frequency domain.
+    // Even though all the result rows will be non-zero,
+    // you need only the first C.rows of them, and thus you
+    // pass nonzeroRows == C.rows
+    cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
+
+    // now copy the result back to C.
+    tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
+}
+
+IMPLEMENT_PARAM_CLASS(KSize, int);
+IMPLEMENT_PARAM_CLASS(Ccorr, bool);
+
+PARAM_TEST_CASE(Convolve_DFT, cv::Size, KSize, Ccorr)
+{
+    cv::Size size;
+    int ksize;
+    bool ccorr;
+
+    cv::Mat src;
+    cv::Mat kernel;
+
+    cv::Mat dst_gold;
+
+    virtual void SetUp()
+    {
+        size  = GET_PARAM(0);
+        ksize = GET_PARAM(1);
+        ccorr = GET_PARAM(2);
+    }
+};
+
+TEST_P(Convolve_DFT, Accuracy)
+{
+    cv::Mat src = randomMat(size, CV_32FC1, 0.0, 100.0);
+    cv::Mat kernel = randomMat(cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0);
+
+    cv::ocl::oclMat dst;
+    cv::ocl::convolve(cv::ocl::oclMat(src), cv::ocl::oclMat(kernel), dst, ccorr);
+    
+    cv::Mat dst_gold;
+    convolveDFT(src, kernel, dst_gold, ccorr);
+
+    EXPECT_MAT_NEAR(dst, dst_gold, 1e-1, "");
+}
+#define DIFFERENT_CONVOLVE_SIZES testing::Values(cv::Size(251, 257), cv::Size(113, 113), cv::Size(200, 480), cv::Size(1300, 1300))
+INSTANTIATE_TEST_CASE_P(OCL_ImgProc, Convolve_DFT, testing::Combine(
+    DIFFERENT_CONVOLVE_SIZES,
+    testing::Values(KSize(19), KSize(23), KSize(45)),
+    testing::Values(Ccorr(true)/*, Ccorr(false)*/))); // false ccorr cannot pass for some instances
 #endif // HAVE_CLAMDFFT

From 3fea2620e69e82e7c933551ce89cc5cbb547dc41 Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Mon, 8 Apr 2013 16:22:20 +0800
Subject: [PATCH 2/3] Fix some compilation errors and warnings.

---
 modules/ocl/src/imgproc.cpp                   |  7 ++--
 modules/ocl/src/match_template.cpp            |  3 ++
 .../opencl/imgproc_mulAndScaleSpectrums.cl    | 36 +++++++++----------
 modules/ocl/test/test_fft.cpp                 |  2 +-
 4 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/modules/ocl/src/imgproc.cpp b/modules/ocl/src/imgproc.cpp
index e3b3d2522..5ffb9dcf8 100644
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -1661,7 +1661,7 @@ Size cv::ocl::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size
     int width = (result_size.width + 2) / 3;
     int height = (result_size.height + 2) / 3;
     width = std::min(width, result_size.width);
-    height = std::min(height, result_size.height);    
+    height = std::min(height, result_size.height);
     return Size(width, height);
 }
 
@@ -1714,7 +1714,7 @@ static void convolve_run_fft(const oclMat &image, const oclMat &templ, oclMat &r
 
             Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
                                  std::min(y + block_size.height, result.rows) - y);
-            
+
             Rect roi1(x, y, result_roi_size.width, result_roi_size.height);
             Rect roi2(0, 0, result_roi_size.width, result_roi_size.height);
 
@@ -1727,6 +1727,9 @@ static void convolve_run_fft(const oclMat &image, const oclMat &templ, oclMat &r
 
 #else
     CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
+#define UNUSED(x) (void)(x);
+    UNUSED(image) UNUSED(templ) UNUSED(result) UNUSED(ccorr) UNUSED(buf)
+#undef UNUSED
 #endif
 }
 static void convolve_run(const oclMat &src, const oclMat &temp1, oclMat &dst, String kernelName, const char **kernelString)
diff --git a/modules/ocl/src/match_template.cpp b/modules/ocl/src/match_template.cpp
index 9dee1f4ea..77f7b1d63 100644
--- a/modules/ocl/src/match_template.cpp
+++ b/modules/ocl/src/match_template.cpp
@@ -112,6 +112,9 @@ namespace cv
             else
                 return false;
 #else
+#define UNUSED(x) (void)(x);
+            UNUSED(method) UNUSED(depth) UNUSED(size)
+#undef  UNUSED
             return true;
 #endif
         }
diff --git a/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl b/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
index 7d3774d07..86d4e5d52 100644
--- a/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
+++ b/modules/ocl/src/opencl/imgproc_mulAndScaleSpectrums.cl
@@ -54,16 +54,16 @@ inline cfloat conjf(cfloat a)
     return (cfloat)( a.x, - a.y );
 }
 
-__kernel void 
-    mulAndScaleSpectrumsKernel(
-    __global const cfloat* a, 
-    __global const cfloat* b, 
-    float scale, 
-    __global cfloat* dst, 
-    uint cols, 
-    uint rows, 
+__kernel void
+mulAndScaleSpectrumsKernel(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
     uint mstep
-    )
+)
 {
     const uint x = get_global_id(0);
     const uint y = get_global_id(1);
@@ -74,16 +74,16 @@ __kernel void
         dst[idx] = (cfloat)( v.x * scale, v.y * scale );
     }
 }
-__kernel void 
-    mulAndScaleSpectrumsKernel_CONJ(
-    __global const cfloat* a, 
-    __global const cfloat* b, 
-    float scale, 
-    __global cfloat* dst, 
-    uint cols, 
-    uint rows, 
+__kernel void
+mulAndScaleSpectrumsKernel_CONJ(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
     uint mstep
-    )
+)
 {
     const uint x = get_global_id(0);
     const uint y = get_global_id(1);
diff --git a/modules/ocl/test/test_fft.cpp b/modules/ocl/test/test_fft.cpp
index d19a47132..030ea1ff1 100644
--- a/modules/ocl/test/test_fft.cpp
+++ b/modules/ocl/test/test_fft.cpp
@@ -226,7 +226,7 @@ TEST_P(Convolve_DFT, Accuracy)
 
     cv::ocl::oclMat dst;
     cv::ocl::convolve(cv::ocl::oclMat(src), cv::ocl::oclMat(kernel), dst, ccorr);
-    
+
     cv::Mat dst_gold;
     convolveDFT(src, kernel, dst_gold, ccorr);
 

From 143f8f69d6a49f18555e9a59abfb3066b4eb4bfb Mon Sep 17 00:00:00 2001
From: peng xiao <hisenxpress@gmail.com>
Date: Mon, 8 Apr 2013 17:15:52 +0800
Subject: [PATCH 3/3] Add some documentation on ocl::convolve

---
 modules/ocl/doc/image_filtering.rst | 41 ++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/modules/ocl/doc/image_filtering.rst b/modules/ocl/doc/image_filtering.rst
index ca97d3a93..3da5d3ded 100644
--- a/modules/ocl/doc/image_filtering.rst
+++ b/modules/ocl/doc/image_filtering.rst
@@ -109,17 +109,52 @@ Returns void
 
 The function calculates the Laplacian of the source image by adding up the second x and y derivatives calculated using the Sobel operator.
 
+ocl::ConvolveBuf
+----------------
+.. ocv:struct:: ocl::ConvolveBuf
+
+Structure providing memory buffers for the :ocv:func:`ocl::convolve` function; it also allows adjusting some specific parameters. ::
+
+    struct CV_EXPORTS ConvolveBuf
+    {
+        Size result_size;
+        Size block_size;
+        Size user_block_size;
+        Size dft_size;
+
+
+        oclMat image_spect, templ_spect, result_spect;
+        oclMat image_block, templ_block, result_data;
+
+        void create(Size image_size, Size templ_size);
+        static Size estimateBlockSize(Size result_size, Size templ_size);
+    };
+
+You can use the field ``user_block_size`` to set a specific block size for the :ocv:func:`ocl::convolve` function. If you leave its default value ``Size(0,0)``, an automatic estimate of the block size is used (which is optimized for speed). By varying ``user_block_size`` you can reduce memory requirements at the cost of speed.
+
+ocl::ConvolveBuf::create
+------------------------
+.. ocv:function:: ocl::ConvolveBuf::create(Size image_size, Size templ_size)
+
+Constructs a buffer for the :ocv:func:`ocl::convolve` function with the respective arguments.
+
 ocl::convolve
 ------------------
 Returns void
 
-.. ocv:function:: void ocl::convolve(const oclMat &image, const oclMat &temp1, oclMat &result)
+.. ocv:function:: void ocl::convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr=false)
 
-    :param image: The source image
+.. ocv:function:: void ocl::convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf)
 
-    :param temp1: Convolution kernel, a single-channel floating point matrix.
+    :param image: The source image. Only  ``CV_32FC1`` images are supported for now.
+
+    :param temp1: Convolution kernel, a single-channel floating point matrix. The size is not greater than the  ``image`` size. The type is the same as  ``image``.
 
     :param result: The destination image
+
+    :param ccorr: Flag selecting cross-correlation instead of convolution.
+
+    :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:struct:`ocl::ConvolveBuf`.
 
 Convolves an image with the kernel. Supports only CV_32FC1 data types and do not support ROI.