From c776bff95bf6605a6842fb5a73cc2e87832e0404 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Wed, 28 Mar 2012 07:11:07 +0000 Subject: [PATCH] #1713 Added the possibility of setting user_block_size manually for gpu::matchTemplate function (and gpu::convolve). Added a buffer param into these functions. Removed using of 2^n block sizes when it's not necessary. --- modules/gpu/doc/image_processing.rst | 84 +++++---- modules/gpu/include/opencv2/gpu/gpu.hpp | 42 +++-- modules/gpu/src/cuda/match_template.cu | 24 +-- modules/gpu/src/imgproc.cpp | 37 ++-- modules/gpu/src/match_template.cpp | 225 ++++++++++++------------ 5 files changed, 223 insertions(+), 189 deletions(-) diff --git a/modules/gpu/doc/image_processing.rst b/modules/gpu/doc/image_processing.rst index 90861c0e6..0a97365e1 100644 --- a/modules/gpu/doc/image_processing.rst +++ b/modules/gpu/doc/image_processing.rst @@ -234,6 +234,35 @@ The source matrix should be continuous, otherwise reallocation and data copying .. seealso:: :ocv:func:`dft` +gpu::ConvolveBuf +---------------- +.. ocv:class:: gpu::ConvolveBuf + +Class providing a memory buffer for :ocv:func:`gpu::convolve` function, plus it allows to adjust some specific parameters. :: + + struct CV_EXPORTS ConvolveBuf + { + Size result_size; + Size block_size; + Size user_block_size; + Size dft_size; + int spect_len; + + GpuMat image_spect, templ_spect, result_spect; + GpuMat image_block, templ_block, result_data; + + void create(Size image_size, Size templ_size); + static Size estimateBlockSize(Size result_size, Size templ_size); + }; + +You can use field `user_block_size` to set specific block size for :ocv:func:`gpu::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed. + +gpu::ConvolveBuf::create +------------------------ +.. ocv:function:: ConvolveBuf::create(Size image_size, Size templ_size) + +Constructs a buffer for :ocv:func:`gpu::convolve` function with respective arguments. + gpu::convolve ----------------- @@ -241,7 +270,7 @@ Computes a convolution (or cross-correlation) of two images. .. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr=false) -.. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf) +.. ocv:function:: void gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream &stream = Stream::Null()) :param image: Source image. Only ``CV_32FC1`` images are supported for now. @@ -251,50 +280,36 @@ Computes a convolution (or cross-correlation) of two images. :param ccorr: Flags to evaluate cross-correlation instead of convolution. - :param buf: Optional buffer to avoid extra memory allocations (for many calls with the same sizes). + :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:class:`gpu::ConvolveBuf`. + + :param stream: Stream for the asynchronous version. .. seealso:: :ocv:func:`gpu::filter2D` +gpu::MatchTemplateBuf +--------------------- +.. ocv:class:: gpu::MatchTemplateBuf +Class providing memory buffers for :ocv:func:`gpu::matchTemplate` function, plus it allows to adjust some specific parameters. :: -gpu::ConvolveBuf ----------------- -.. ocv:class:: gpu::ConvolveBuf - -Class providing a memory buffer for the :ocv:func:`gpu::convolve` function. :: - - struct CV_EXPORTS ConvolveBuf + struct CV_EXPORTS MatchTemplateBuf { - ConvolveBuf() {} - ConvolveBuf(Size image_size, Size templ_size) - { create(image_size, templ_size); } - void create(Size image_size, Size templ_size); - - private: - // Hidden + Size user_block_size; + GpuMat imagef, templf; + std::vector images; + std::vector image_sums; + std::vector image_sqsums; }; - - -gpu::ConvolveBuf::ConvolveBuf ---------------------------------- -The constructors. - -.. ocv:function:: ConvolveBuf::ConvolveBuf() - -Constructs an empty buffer that is properly resized after the first call of the :ocv:func:`gpu::convolve` function. - -.. ocv:function:: ConvolveBuf::ConvolveBuf(Size image_size, Size templ_size) - -Constructs a buffer for the :ocv:func:`gpu::convolve` function with respective arguments. - - +You can use field `user_block_size` to set specific block size for :ocv:func:`gpu::matchTemplate` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed. gpu::matchTemplate ---------------------- Computes a proximity map for a raster template and an image where the template is searched for. -.. ocv:function:: void gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method) +.. ocv:function:: void gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null()) + +.. ocv:function:: void gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null()) :param image: Source image. ``CV_32F`` and ``CV_8U`` depth images (1..4 channels) are supported for now. @@ -303,6 +318,10 @@ Computes a proximity map for a raster template and an image where the template i :param result: Map containing comparison results ( ``CV_32FC1`` ). If ``image`` is *W x H* and ``templ`` is *w x h*, then ``result`` must be *W-w+1 x H-h+1*. :param method: Specifies the way to compare the template with the image. + + :param buf: Optional buffer to avoid extra memory allocations and to adjust some specific parameters. See :ocv:class:`gpu::MatchTemplateBuf`. + + :param stream: Stream for the asynchronous version. The following methods are supported for the ``CV_8U`` depth images for now: @@ -321,7 +340,6 @@ Computes a proximity map for a raster template and an image where the template i .. seealso:: :ocv:func:`matchTemplate` - gpu::remap -------------- Applies a generic geometrical transformation to an image. diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp index 0cf85cebd..ef7ce713c 100644 --- a/modules/gpu/include/opencv2/gpu/gpu.hpp +++ b/modules/gpu/include/opencv2/gpu/gpu.hpp @@ -716,36 +716,42 @@ CV_EXPORTS void mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c //! For complex-to-real transform it is assumed that the source matrix is packed in CUFFT's format. CV_EXPORTS void dft(const GpuMat& src, GpuMat& dst, Size dft_size, int flags=0, Stream& stream = Stream::Null()); -//! computes convolution (or cross-correlation) of two images using discrete Fourier transform -//! supports source images of 32FC1 type only -//! result matrix will have 32FC1 type -struct CV_EXPORTS ConvolveBuf; -CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); -CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); - struct CV_EXPORTS ConvolveBuf { - ConvolveBuf() {} - ConvolveBuf(Size image_size, Size templ_size) - { create(image_size, templ_size); } - void create(Size image_size, Size templ_size); - void create(Size image_size, Size templ_size, Size block_size); - -private: - static Size estimateBlockSize(Size result_size, Size templ_size); - friend void convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream); - Size result_size; Size block_size; + Size user_block_size; Size dft_size; int spect_len; GpuMat image_spect, templ_spect, result_spect; GpuMat image_block, templ_block, result_data; + + void create(Size image_size, Size templ_size); + static Size estimateBlockSize(Size result_size, Size templ_size); +}; + + +//! computes convolution (or cross-correlation) of two images using discrete Fourier transform +//! supports source images of 32FC1 type only +//! result matrix will have 32FC1 type +CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr = false); +CV_EXPORTS void convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream = Stream::Null()); + +struct CV_EXPORTS MatchTemplateBuf +{ + Size user_block_size; + GpuMat imagef, templf; + std::vector images; + std::vector image_sums; + std::vector image_sqsums; }; //! computes the proximity map for the raster template and the image where the template is searched for -CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream = Stream::Null()); +CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream &stream = Stream::Null()); + +//! computes the proximity map for the raster template and the image where the template is searched for +CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, MatchTemplateBuf &buf, Stream& stream = Stream::Null()); //! smoothes the source image and downsamples it CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()); diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index 3d4f8eb43..0d4442182 100644 --- a/modules/gpu/src/cuda/match_template.cu +++ b/modules/gpu/src/cuda/match_template.cu @@ -559,7 +559,7 @@ namespace cv { namespace gpu { namespace device void matchTemplatePrepared_CCOFF_NORMED_8U( int w, int h, const DevMem2D_ image_sum, const DevMem2D_ image_sqsum, - unsigned int templ_sum, unsigned int templ_sqsum, + unsigned int templ_sum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); @@ -618,8 +618,8 @@ namespace cv { namespace gpu { namespace device int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, - unsigned int templ_sum_r, unsigned int templ_sqsum_r, - unsigned int templ_sum_g, unsigned int templ_sqsum_g, + unsigned int templ_sum_r, unsigned long long templ_sqsum_r, + unsigned int templ_sum_g, unsigned long long templ_sqsum_g, DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); @@ -694,9 +694,9 @@ namespace cv { namespace gpu { namespace device const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, const DevMem2D_ image_sum_b, const DevMem2D_ image_sqsum_b, - unsigned int templ_sum_r, unsigned int templ_sqsum_r, - unsigned int templ_sum_g, unsigned int templ_sqsum_g, - unsigned int templ_sum_b, unsigned int templ_sqsum_b, + unsigned int templ_sum_r, unsigned long long templ_sqsum_r, + unsigned int templ_sum_g, unsigned long long templ_sqsum_g, + unsigned int templ_sum_b, unsigned long long templ_sqsum_b, DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); @@ -782,10 +782,10 @@ namespace cv { namespace gpu { namespace device const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, const DevMem2D_ image_sum_b, const DevMem2D_ image_sqsum_b, const DevMem2D_ image_sum_a, const DevMem2D_ image_sqsum_a, - unsigned int templ_sum_r, unsigned int templ_sqsum_r, - unsigned int templ_sum_g, unsigned int templ_sqsum_g, - unsigned int templ_sum_b, unsigned int templ_sqsum_b, - unsigned int templ_sum_a, unsigned int templ_sqsum_a, + unsigned int templ_sum_r, unsigned long long templ_sqsum_r, + unsigned int templ_sum_g, unsigned long long templ_sqsum_g, + unsigned int templ_sum_b, unsigned long long templ_sqsum_b, + unsigned int templ_sum_a, unsigned long long templ_sqsum_a, DevMem2Df result, cudaStream_t stream) { dim3 threads(32, 8); @@ -822,7 +822,7 @@ namespace cv { namespace gpu { namespace device template __global__ void normalizeKernel_8U( int w, int h, const PtrStep image_sqsum, - unsigned int templ_sqsum, DevMem2Df result) + unsigned long long templ_sqsum, DevMem2Df result) { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; @@ -837,7 +837,7 @@ namespace cv { namespace gpu { namespace device } void normalize_8U(int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream) + unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream) { dim3 threads(32, 8); dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y)); diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp index 98f7fa406..0043ebd9c 100644 --- a/modules/gpu/src/imgproc.cpp +++ b/modules/gpu/src/imgproc.cpp @@ -1293,21 +1293,25 @@ void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size) { result_size = Size(image_size.width - templ_size.width + 1, image_size.height - templ_size.height + 1); - create(image_size, templ_size, estimateBlockSize(result_size, templ_size)); -} - -void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size, Size block_size) -{ - result_size = Size(image_size.width - templ_size.width + 1, - image_size.height - templ_size.height + 1); - - this->block_size = block_size; + block_size = user_block_size; + if (user_block_size.width == 0 || user_block_size.height == 0) + block_size = estimateBlockSize(result_size, templ_size); dft_size.width = 1 << int(ceil(std::log(block_size.width + templ_size.width - 1.) / std::log(2.))); dft_size.height = 1 << int(ceil(std::log(block_size.height + templ_size.height - 1.) / std::log(2.))); - if (dft_size.width < 512) dft_size.width = 512; - if (dft_size.height < 512) dft_size.height = 512; + + // CUFFT has hard-coded kernels for power-of-2 sizes (up to 8192), + // see CUDA Toolkit 4.1 CUFFT Library Programming Guide + if (dft_size.width > 8192) + dft_size.width = getOptimalDFTSize(block_size.width + templ_size.width - 1.); + if (dft_size.height > 8192) + dft_size.height = getOptimalDFTSize(block_size.height + templ_size.height - 1.); + + // To avoid wasting time doing small DFTs + dft_size.width = std::max(dft_size.width, 512); + dft_size.height = std::max(dft_size.height, 512); + createContinuous(dft_size, CV_32F, image_block); createContinuous(dft_size, CV_32F, templ_block); createContinuous(dft_size, CV_32F, result_data); @@ -1317,17 +1321,18 @@ void cv::gpu::ConvolveBuf::create(Size image_size, Size templ_size, Size block_s createContinuous(1, spect_len, CV_32FC2, templ_spect); createContinuous(1, spect_len, CV_32FC2, result_spect); - this->block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); - this->block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); + // Use maximum result matrix block size for the estimated DFT block size + block_size.width = std::min(dft_size.width - templ_size.width + 1, result_size.width); + block_size.height = std::min(dft_size.height - templ_size.height + 1, result_size.height); } -Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size templ_size) +Size cv::gpu::ConvolveBuf::estimateBlockSize(Size result_size, Size /*templ_size*/) { int width = (result_size.width + 2) / 3; int height = (result_size.height + 2) / 3; width = std::min(width, result_size.width); - height = std::min(height, result_size.height); + height = std::min(height, result_size.height); return Size(width, height); } @@ -1367,7 +1372,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, cufftHandle planR2C, planC2R; cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R)); - cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); + cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C)); cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) ); cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) ); diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp index 59b97e719..c4a818068 100644 --- a/modules/gpu/src/match_template.cpp +++ b/modules/gpu/src/match_template.cpp @@ -101,23 +101,23 @@ namespace cv { namespace gpu { namespace device void matchTemplatePrepared_CCOFF_NORMED_8U( int w, int h, const DevMem2D_ image_sum, const DevMem2D_ image_sqsum, - unsigned int templ_sum, unsigned int templ_sqsum, + unsigned int templ_sum, unsigned long long templ_sqsum, DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC2( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, - unsigned int templ_sum_r, unsigned int templ_sqsum_r, - unsigned int templ_sum_g, unsigned int templ_sqsum_g, + unsigned int templ_sum_r, unsigned long long templ_sqsum_r, + unsigned int templ_sum_g, unsigned long long templ_sqsum_g, DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC3( int w, int h, const DevMem2D_ image_sum_r, const DevMem2D_ image_sqsum_r, const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, const DevMem2D_ image_sum_b, const DevMem2D_ image_sqsum_b, - unsigned int templ_sum_r, unsigned int templ_sqsum_r, - unsigned int templ_sum_g, unsigned int templ_sqsum_g, - unsigned int templ_sum_b, unsigned int templ_sqsum_b, + unsigned int templ_sum_r, unsigned long long templ_sqsum_r, + unsigned int templ_sum_g, unsigned long long templ_sqsum_g, + unsigned int templ_sum_b, unsigned long long templ_sqsum_b, DevMem2Df result, cudaStream_t stream); void matchTemplatePrepared_CCOFF_NORMED_8UC4( int w, int h, @@ -125,14 +125,14 @@ namespace cv { namespace gpu { namespace device const DevMem2D_ image_sum_g, const DevMem2D_ image_sqsum_g, const DevMem2D_ image_sum_b, const DevMem2D_ image_sqsum_b, const DevMem2D_ image_sum_a, const DevMem2D_ image_sqsum_a, - unsigned int templ_sum_r, unsigned int templ_sqsum_r, - unsigned int templ_sum_g, unsigned int templ_sqsum_g, - unsigned int templ_sum_b, unsigned int templ_sqsum_b, - unsigned int templ_sum_a, unsigned int templ_sqsum_a, + unsigned int templ_sum_r, unsigned long long templ_sqsum_r, + unsigned int templ_sum_g, unsigned long long templ_sqsum_g, + unsigned int templ_sum_b, unsigned long long templ_sqsum_b, + unsigned int templ_sum_a, unsigned long long templ_sqsum_a, DevMem2Df result, cudaStream_t stream); void normalize_8U(int w, int h, const DevMem2D_ image_sqsum, - unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream); + unsigned long long templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream); void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream); } @@ -146,20 +146,6 @@ namespace // Evaluates optimal template's area threshold. If // template's area is less than the threshold, we use naive match // template version, otherwise FFT-based (if available) - int getTemplateThreshold(int method, int depth); - - void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result); - void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - - void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result); - void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - - void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result); - - int getTemplateThreshold(int method, int depth) { switch (method) @@ -177,8 +163,9 @@ namespace } - void matchTemplate_CCORR_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) - { + void matchTemplate_CCORR_32F( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) + { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F)) { @@ -186,14 +173,22 @@ namespace return; } - GpuMat result_; - ConvolveBuf buf; - convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream); - extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream)); + ConvolveBuf convolve_buf; + convolve_buf.user_block_size = buf.user_block_size; + + if (image.channels() == 1) + convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream); + else + { + GpuMat result_; + convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream); + extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream)); + } } - void matchTemplate_CCORR_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) + void matchTemplate_CCORR_8U( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) { if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U)) { @@ -202,41 +197,43 @@ namespace return; } - GpuMat imagef, templf; if (stream) { - stream.enqueueConvert(image, imagef, CV_32F); - stream.enqueueConvert(templ, templf, CV_32F); + stream.enqueueConvert(image, buf.imagef, CV_32F); + stream.enqueueConvert(templ, buf.templf, CV_32F); } else { - image.convertTo(imagef, CV_32F); - templ.convertTo(templf, CV_32F); + image.convertTo(buf.imagef, CV_32F); + templ.convertTo(buf.templf, CV_32F); } - matchTemplate_CCORR_32F(imagef, templf, result, stream); + matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream); } - void matchTemplate_CCORR_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) + void matchTemplate_CCORR_NORMED_8U( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) { - matchTemplate_CCORR_8U(image, templ, result, stream); + matchTemplate_CCORR_8U(image, templ, result, buf, stream); - GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum, stream); + buf.image_sqsums.resize(1); + sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream); - unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0]; - normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); + unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; + normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) + void matchTemplate_SQDIFF_32F( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) { result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) + void matchTemplate_SQDIFF_8U( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) { if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U)) { @@ -245,48 +242,48 @@ namespace return; } - GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum, stream); + buf.image_sqsums.resize(1); + sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream); unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; - matchTemplate_CCORR_8U(image, templ, result, stream); - matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); + matchTemplate_CCORR_8U(image, templ, result, buf, stream); + matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_SQDIFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) + void matchTemplate_SQDIFF_NORMED_8U( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) { - GpuMat img_sqsum; - sqrIntegral(image.reshape(1), img_sqsum, stream); + buf.image_sqsums.resize(1); + sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream); unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; - matchTemplate_CCORR_8U(image, templ, result, stream); - matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); + matchTemplate_CCORR_8U(image, templ, result, buf, stream); + matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream)); } - void matchTemplate_CCOFF_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) + void matchTemplate_CCOFF_8U( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) { - matchTemplate_CCORR_8U(image, templ, result, stream); + matchTemplate_CCORR_8U(image, templ, result, buf, stream); if (image.channels() == 1) { - GpuMat image_sum; - integral(image, image_sum, stream); + buf.image_sums.resize(1); + integral(image, buf.image_sums[0], stream); unsigned int templ_sum = (unsigned int)sum(templ)[0]; - matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sum, templ_sum, result, StreamAccessor::getStream(stream)); + matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream)); } else { - vector images; - vector image_sums(image.channels()); - - split(image, images); + split(image, buf.images); + buf.image_sums.resize(buf.images.size()); for (int i = 0; i < image.channels(); ++i) - integral(images[i], image_sums[i], stream); + integral(buf.images[i], buf.image_sums[i], stream); Scalar templ_sum = sum(templ); @@ -294,19 +291,19 @@ namespace { case 2: matchTemplatePrepared_CCOFF_8UC2( - templ.cols, templ.rows, image_sums[0], image_sums[1], + templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], result, StreamAccessor::getStream(stream)); break; case 3: matchTemplatePrepared_CCOFF_8UC3( - templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], + templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], result, StreamAccessor::getStream(stream)); break; case 4: matchTemplatePrepared_CCOFF_8UC4( - templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3], + templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3], (unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2], (unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream)); break; @@ -317,46 +314,45 @@ namespace } - void matchTemplate_CCOFF_NORMED_8U(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream) + void matchTemplate_CCOFF_NORMED_8U( + const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream) { - GpuMat imagef, templf; if (stream) { - stream.enqueueConvert(image, imagef, CV_32F); - stream.enqueueConvert(templ, templf, CV_32F); + stream.enqueueConvert(image, buf.imagef, CV_32F); + stream.enqueueConvert(templ, buf.templf, CV_32F); } else { - image.convertTo(imagef, CV_32F); - templ.convertTo(templf, CV_32F); + image.convertTo(buf.imagef, CV_32F); + templ.convertTo(buf.templf, CV_32F); } - matchTemplate_CCORR_32F(imagef, templf, result, stream); + matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream); if (image.channels() == 1) { - GpuMat image_sum, image_sqsum; - integral(image, image_sum, stream); - sqrIntegral(image, image_sqsum, stream); + buf.image_sums.resize(1); + integral(image, buf.image_sums[0], stream); + buf.image_sqsums.resize(1); + sqrIntegral(image, buf.image_sqsums[0], stream); unsigned int templ_sum = (unsigned int)sum(templ)[0]; - unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0]; + unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ)[0]; matchTemplatePrepared_CCOFF_NORMED_8U( - templ.cols, templ.rows, image_sum, image_sqsum, + templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0], templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream)); } else { - vector images; - vector image_sums(image.channels()); - vector image_sqsums(image.channels()); - - split(image, images); + split(image, buf.images); + buf.image_sums.resize(buf.images.size()); + buf.image_sqsums.resize(buf.images.size()); for (int i = 0; i < image.channels(); ++i) { - integral(images[i], image_sums[i], stream); - sqrIntegral(images[i], image_sqsums[i], stream); + integral(buf.images[i], buf.image_sums[i], stream); + sqrIntegral(buf.images[i], buf.image_sqsums[i], stream); } Scalar templ_sum = sum(templ); @@ -367,34 +363,34 @@ namespace case 2: matchTemplatePrepared_CCOFF_NORMED_8UC2( templ.cols, templ.rows, - image_sums[0], image_sqsums[0], - image_sums[1], image_sqsums[1], - (unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0], - (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], + buf.image_sums[0], buf.image_sqsums[0], + buf.image_sums[1], buf.image_sqsums[1], + (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0], + (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1], result, StreamAccessor::getStream(stream)); break; case 3: matchTemplatePrepared_CCOFF_NORMED_8UC3( templ.cols, templ.rows, - image_sums[0], image_sqsums[0], - image_sums[1], image_sqsums[1], - image_sums[2], image_sqsums[2], - (unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0], - (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], - (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2], + buf.image_sums[0], buf.image_sqsums[0], + buf.image_sums[1], buf.image_sqsums[1], + buf.image_sums[2], buf.image_sqsums[2], + (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0], + (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1], + (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2], result, StreamAccessor::getStream(stream)); break; case 4: matchTemplatePrepared_CCOFF_NORMED_8UC4( templ.cols, templ.rows, - image_sums[0], image_sqsums[0], - image_sums[1], image_sqsums[1], - image_sums[2], image_sqsums[2], - image_sums[3], image_sqsums[3], - (unsigned int)templ_sum[0], (unsigned int)templ_sqsum[0], - (unsigned int)templ_sum[1], (unsigned int)templ_sqsum[1], - (unsigned int)templ_sum[2], (unsigned int)templ_sqsum[2], - (unsigned int)templ_sum[3], (unsigned int)templ_sqsum[3], + buf.image_sums[0], buf.image_sqsums[0], + buf.image_sums[1], buf.image_sqsums[1], + buf.image_sums[2], buf.image_sqsums[2], + buf.image_sums[3], buf.image_sqsums[3], + (unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0], + (unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1], + (unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2], + (unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3], result, StreamAccessor::getStream(stream)); break; default: @@ -406,16 +402,25 @@ namespace void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream) +{ + MatchTemplateBuf buf; + matchTemplate(image, templ, result, method, buf, stream); +} + + +void cv::gpu::matchTemplate( + const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, + MatchTemplateBuf &buf, Stream& stream) { CV_Assert(image.type() == templ.type()); CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows); - typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, Stream& stream); + typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream); - static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U, - ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U, + static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U, + ::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U, ::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U }; - static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0, + static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0, ::matchTemplate_CCORR_32F, 0, 0, 0 }; const Caller* callers = 0; @@ -428,7 +433,7 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& re Caller caller = callers[method]; CV_Assert(caller); - caller(image, templ, result, stream); + caller(image, templ, result, buf, stream); } #endif