From 40304721a775023a13278fcb757e565c333d4774 Mon Sep 17 00:00:00 2001 From: Alexey Spizhevoy Date: Wed, 8 Dec 2010 13:03:53 +0000 Subject: [PATCH] added support of CV_TM_CCORR (via FFT) into gpu::matchTemplate (versions both with block and without blocks) --- modules/gpu/CMakeLists.txt | 1 + modules/gpu/src/cuda/match_template.cu | 31 +++- modules/gpu/src/match_template.cpp | 211 +++++++++++++++++++++++-- tests/gpu/src/match_template.cpp | 102 ++++++++---- 4 files changed, 301 insertions(+), 44 deletions(-) diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index 239b06aeb..77228e5a8 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -112,6 +112,7 @@ target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS}) if (HAVE_CUDA) target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES}) + CUDA_ADD_CUFFT_TO_TARGET(${the_target}) endif() if(MSVC) diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu index 42c895125..7eda2c92c 100644 --- a/modules/gpu/src/cuda/match_template.cu +++ b/modules/gpu/src/cuda/match_template.cu @@ -40,8 +40,12 @@ // //M*/ +#include #include "internal_shared.hpp" +#include +using namespace std; + using namespace cv::gpu; namespace cv { namespace gpu { namespace imgproc { @@ -50,7 +54,7 @@ texture imageTex_8U; texture templTex_8U; -__global__ void matchTemplateKernel_8U_SqDiff(int w, int h, DevMem2Df result) +__global__ void matchTemplateKernel_8U_SQDIFF(int w, int h, DevMem2Df result) { int x = blockDim.x * blockIdx.x + threadIdx.x; int y = blockDim.y * blockIdx.y + threadIdx.y; @@ -75,7 +79,7 @@ __global__ void matchTemplateKernel_8U_SqDiff(int w, int h, DevMem2Df result) } -void matchTemplateCaller_8U_SqDiff(const DevMem2D image, const DevMem2D templ, DevMem2Df result) +void matchTemplate_8U_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2Df result) { dim3 threads(32, 8); dim3 grid(divUp(image.cols - templ.cols + 1, threads.x), @@ -87,11 +91,32 @@ void matchTemplateCaller_8U_SqDiff(const DevMem2D image, const DevMem2D templ, D imageTex_8U.filterMode = cudaFilterModePoint; templTex_8U.filterMode = cudaFilterModePoint; - matchTemplateKernel_8U_SqDiff<<>>(templ.cols, templ.rows, result); + matchTemplateKernel_8U_SQDIFF<<>>(templ.cols, templ.rows, result); cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaUnbindTexture(imageTex_8U)); cudaSafeCall(cudaUnbindTexture(templTex_8U)); } +__global__ void multiplyAndNormalizeSpectsKernel(int n, float scale, const cufftComplex* a, + const cufftComplex* b, cufftComplex* c) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + if (x < n) + { + cufftComplex v = cuCmulf(a[x], cuConjf(b[x])); + c[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale); + } +} + + +void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a, const cufftComplex* b, + cufftComplex* c) +{ + dim3 threads(256); + dim3 grid(divUp(n, threads.x)); + multiplyAndNormalizeSpectsKernel<<>>(n, scale, a, b, c); +} + + }}} diff --git a/modules/gpu/src/match_template.cpp b/modules/gpu/src/match_template.cpp index db182bd0e..6713f7ba5 100644 --- a/modules/gpu/src/match_template.cpp +++ b/modules/gpu/src/match_template.cpp @@ -41,6 +41,14 @@ //M*/ #include "precomp.hpp" +#include +#include +#include + +using namespace cv; +using namespace cv::gpu; + +#define BLOCK_VERSION #if !defined (HAVE_CUDA) @@ -48,22 +56,207 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_ #else -namespace cv { namespace gpu { namespace imgproc { - - void matchTemplateCaller_8U_SqDiff(const DevMem2D, const DevMem2D, DevMem2Df); - +namespace cv { namespace gpu { namespace imgproc +{ + void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a, + const cufftComplex* b, cufftComplex* c); + void matchTemplate_8U_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2Df result); }}} + +namespace +{ + + template + void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result); + + +#ifdef BLOCK_VERSION + void estimateBlockSize(int w, int h, int tw, int th, int& bw, int& bh) + { + const int scale = 40; + const int bh_min = 1024; + const int bw_min = 1024; + bw = std::max(tw * scale, bw_min); + bh = std::max(th * scale, bh_min); + bw = std::min(bw, w); + bh = std::min(bh, h); + } +#endif + + + template <> + void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result) + { + result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); + imgproc::matchTemplate_8U_SQDIFF(image, templ, result); + } + + +#ifdef BLOCK_VERSION + template <> + void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result) + { + result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); + + Size block_size; + estimateBlockSize(result.cols, result.rows, templ.cols, templ.rows, + block_size.width, block_size.height); + + Size dft_size; + dft_size.width = getOptimalDFTSize(block_size.width + templ.cols - 1); + dft_size.height = getOptimalDFTSize(block_size.width + templ.rows - 1); + + block_size.width = std::min(dft_size.width - templ.cols + 1, result.cols); + block_size.height = std::min(dft_size.height - templ.rows + 1, result.rows); + + cufftReal* image_data; + cufftReal* templ_data; + cufftReal* result_data; + cudaMalloc((void**)&image_data, sizeof(cufftReal) * dft_size.area()); + cudaMalloc((void**)&templ_data, sizeof(cufftReal) * dft_size.area()); + cudaMalloc((void**)&result_data, sizeof(cufftReal) * dft_size.area()); + + int spect_len = dft_size.height * (dft_size.width / 2 + 1); + cufftComplex* image_spect; + cufftComplex* templ_spect; + cufftComplex* result_spect; + cudaMalloc((void**)&image_spect, sizeof(cufftComplex) * spect_len); + cudaMalloc((void**)&templ_spect, sizeof(cufftComplex) * spect_len); + cudaMalloc((void**)&result_spect, sizeof(cufftComplex) * spect_len); + + cufftHandle planR2C, planC2R; + CV_Assert(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R) == CUFFT_SUCCESS); + CV_Assert(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C) == CUFFT_SUCCESS); + + GpuMat templ_roi(templ.size(), CV_32S, templ.data, templ.step); + GpuMat templ_block(dft_size, CV_32S, templ_data, dft_size.width * sizeof(cufftReal)); + copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0, + templ_block.cols - templ_roi.cols, 0); + CV_Assert(cufftExecR2C(planR2C, templ_data, templ_spect) == CUFFT_SUCCESS); + + GpuMat image_block(dft_size, CV_32S, image_data, dft_size.width * sizeof(cufftReal)); + + for (int y = 0; y < result.rows; y += block_size.height) + { + for (int x = 0; x < result.cols; x += block_size.width) + { + Size image_roi_size; + image_roi_size.width = min(x + dft_size.width, image.cols) - x; + image_roi_size.height = min(y + dft_size.height, image.rows) - y; + GpuMat image_roi(image_roi_size, CV_32S, (void*)(image.ptr(y) + x), image.step); + copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows, 0, + image_block.cols - image_roi.cols, 0); + + CV_Assert(cufftExecR2C(planR2C, image_data, image_spect) == CUFFT_SUCCESS); + imgproc::multiplyAndNormalizeSpects(spect_len, 1.f / dft_size.area(), + image_spect, templ_spect, result_spect); + CV_Assert(cufftExecC2R(planC2R, result_spect, result_data) == CUFFT_SUCCESS); + + Size result_roi_size; + result_roi_size.width = min(x + block_size.width, result.cols) - x; + result_roi_size.height = min(y + block_size.height, result.rows) - y; + GpuMat result_roi(result_roi_size, CV_32F, (void*)(result.ptr(y) + x), result.step); + GpuMat result_block(result_roi_size, CV_32F, result_data, dft_size.width * sizeof(cufftReal)); + result_block.copyTo(result_roi); + } + } + + cufftDestroy(planR2C); + cufftDestroy(planC2R); + + cudaFree(image_spect); + cudaFree(templ_spect); + cudaFree(result_spect); + cudaFree(image_data); + cudaFree(templ_data); + cudaFree(result_data); + } +#else + template <> + void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result) + { + Size opt_size; + opt_size.width = getOptimalDFTSize(image.cols); + opt_size.height = getOptimalDFTSize(image.rows); + + cufftReal* image_data; + cufftReal* templ_data; + cufftReal* result_data; + cudaMalloc((void**)&image_data, sizeof(cufftReal) * opt_size.area()); + cudaMalloc((void**)&templ_data, sizeof(cufftReal) * opt_size.area()); + cudaMalloc((void**)&result_data, sizeof(cufftReal) * opt_size.area()); + + int spect_len = opt_size.height * (opt_size.width / 2 + 1); + cufftComplex* image_spect; + cufftComplex* templ_spect; + cufftComplex* result_spect; + cudaMalloc((void**)&image_spect, sizeof(cufftComplex) * spect_len); + cudaMalloc((void**)&templ_spect, sizeof(cufftComplex) * spect_len); + cudaMalloc((void**)&result_spect, sizeof(cufftComplex) * spect_len); + + GpuMat image_(image.size(), CV_32S, image.data, image.step); + GpuMat image_cont(opt_size, CV_32S, image_data, opt_size.width * sizeof(cufftReal)); + copyMakeBorder(image_, image_cont, 0, image_cont.rows - image.rows, 0, + image_cont.cols - image.cols, 0); + + GpuMat templ_(templ.size(), CV_32S, templ.data, templ.step); + GpuMat templ_cont(opt_size, CV_32S, templ_data, opt_size.width * sizeof(cufftReal)); + copyMakeBorder(templ_, templ_cont, 0, templ_cont.rows - templ.rows, 0, + templ_cont.cols - templ.cols, 0); + + cufftHandle planR2C, planC2R; + CV_Assert(cufftPlan2d(&planC2R, opt_size.height, opt_size.width, CUFFT_C2R) == CUFFT_SUCCESS); + CV_Assert(cufftPlan2d(&planR2C, opt_size.height, opt_size.width, CUFFT_R2C) == CUFFT_SUCCESS); + + CV_Assert(cufftExecR2C(planR2C, image_data, image_spect) == CUFFT_SUCCESS); + CV_Assert(cufftExecR2C(planR2C, templ_data, templ_spect) == CUFFT_SUCCESS); + imgproc::multiplyAndNormalizeSpects(spect_len, 1.f / opt_size.area(), + image_spect, templ_spect, result_spect); + + CV_Assert(cufftExecC2R(planC2R, result_spect, result_data) == CUFFT_SUCCESS); + + cufftDestroy(planR2C); + cufftDestroy(planC2R); + + GpuMat result_cont(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F, + result_data, opt_size.width * sizeof(cufftReal)); + result_cont.copyTo(result); + + cudaFree(image_spect); + cudaFree(templ_spect); + cudaFree(result_spect); + cudaFree(image_data); + cudaFree(templ_data); + cudaFree(result_data); + } +#endif + +} + + void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method) { - CV_Assert(image.type() == CV_8U); - CV_Assert(method == CV_TM_SQDIFF); - CV_Assert(image.type() == templ.type()); CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows); - result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); - imgproc::matchTemplateCaller_8U_SqDiff(image, templ, result); + typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&); + + static const Caller callers8U[] = { ::matchTemplate, 0, 0, 0, 0, 0 }; + static const Caller callers32F[] = { 0, 0, ::matchTemplate, 0, 0, 0 }; + + const Caller* callers; + switch (image.type()) + { + case CV_8U: callers = callers8U; break; + case CV_32F: callers = callers32F; break; + default: CV_Error(CV_StsBadArg, "matchTemplate: unsupported data type"); + } + + Caller caller = callers[method]; + CV_Assert(caller); + caller(image, templ, result); } #endif + diff --git a/tests/gpu/src/match_template.cpp b/tests/gpu/src/match_template.cpp index c80cb29a2..599f501f5 100644 --- a/tests/gpu/src/match_template.cpp +++ b/tests/gpu/src/match_template.cpp @@ -41,6 +41,17 @@ //M*/ #include "gputest.hpp" +#include +#include + +//#define SHOW_TIME + +#ifdef SHOW_TIME +#include +#define F(x) +#else +#define F(x) +#endif using namespace cv; using namespace std; @@ -57,40 +68,75 @@ struct CV_GpuMatchTemplateTest: CvTest Mat dst_gold; gpu::GpuMat dst; int n, m, h, w; + F(clock_t t;) - for (int i = 0; i < 4; ++i) + for (int i = 0; i < 3; ++i) { - n = 1 + rand() % 100; - m = 1 + rand() % 100; - do h = 1 + rand() % 20; while (h > n); - do w = 1 + rand() % 20; while (w > m); + n = 1 + rand() % 2000; + m = 1 + rand() % 1000; + do h = 1 + rand() % 30; while (h > n); + do w = 1 + rand() % 30; while (w > m); + gen(image, n, m, CV_8U); gen(templ, h, w, CV_8U); - - match_template_naive(image, templ, dst_gold); + F(t = clock();) + matchTemplate(image, templ, dst_gold, CV_TM_SQDIFF); + F(cout << "cpu:" << clock() - t << endl;) + F(t = clock();) gpu::matchTemplate(gpu::GpuMat(image), gpu::GpuMat(templ), dst, CV_TM_SQDIFF); - if (!check8U(dst_gold, Mat(dst))) return; + F(cout << "gpu_block: " << clock() - t << endl;) + if (!check(dst_gold, Mat(dst), 5 * h * w * 1e-5f)) return; + + gen(image, n, m, CV_32F); + gen(templ, h, w, CV_32F); + F(t = clock();) + matchTemplate(image, templ, dst_gold, CV_TM_CCORR); + F(cout << "cpu:" << clock() - t << endl;) + F(t = clock();) + gpu::matchTemplate(gpu::GpuMat(image), gpu::GpuMat(templ), dst, CV_TM_CCORR); + F(cout << "gpu_block: " << clock() - t << endl;) + if (!check(dst_gold, Mat(dst), 0.25f * h * w * 1e-5f)) return; } } catch (const Exception& e) { + ts->printf(CvTS::CONSOLE, e.what()); if (!check_and_treat_gpu_exception(e, ts)) throw; return; } } - void gen(Mat& a, int rows, int cols, int type) { RNG rng; a.create(rows, cols, type); if (type == CV_8U) rng.fill(a, RNG::UNIFORM, Scalar(0), Scalar(10)); + else if (type == CV_32F) + rng.fill(a, RNG::UNIFORM, Scalar(0.f), Scalar(1.f)); } - // Naive version for unsigned char - // Time complexity is O(a.size().area() * b.size().area()). - void match_template_naive(const Mat& a, const Mat& b, Mat& c) + bool check(const Mat& a, const Mat& b, float max_err) + { + if (a.size() != b.size()) + { + ts->printf(CvTS::CONSOLE, "bad size"); + ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); + return false; + } + + float err = (float)norm(a, b, NORM_INF); + if (err > max_err) + { + ts->printf(CvTS::CONSOLE, "bad accuracy: %f\n", err); + ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); + return false; + } + + return true; + } + + void match_template_naive_SQDIFF(const Mat& a, const Mat& b, Mat& c) { c.create(a.rows - b.rows + 1, a.cols - b.cols + 1, CV_32F); for (int i = 0; i < c.rows; ++i) @@ -114,32 +160,24 @@ struct CV_GpuMatchTemplateTest: CvTest } } - - bool check8U(const Mat& a, const Mat& b) + void match_template_naive_CCORR(const Mat& a, const Mat& b, Mat& c) { - if (a.size() != b.size()) + c.create(a.rows - b.rows + 1, a.cols - b.cols + 1, CV_32F); + for (int i = 0; i < c.rows; ++i) { - ts->printf(CvTS::CONSOLE, "bad size"); - ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); - return false; - } - - for (int i = 0; i < a.rows; ++i) - { - for (int j = 0; j < a.cols; ++j) + for (int j = 0; j < c.cols; ++j) { - float v1 = a.at(i, j); - float v2 = b.at(i, j); - if (fabs(v1 - v2) > 1e-3f) + float sum = 0.f; + for (int y = 0; y < b.rows; ++y) { - ts->printf(CvTS::CONSOLE, "(gold)%f != %f, pos: (%d, %d) size: (%d, %d)\n", - v1, v2, j, i, a.cols, a.rows); - ts->set_failed_test_info(CvTS::FAIL_INVALID_OUTPUT); - return false; + const float* arow = a.ptr(i + y); + const float* brow = b.ptr(y); + for (int x = 0; x < b.cols; ++x) + sum += arow[j + x] * brow[x]; } + c.at(i, j) = sum; } } - - return true; } } match_template_test; +