diff --git a/modules/gpu/src/cuda/match_template.cu b/modules/gpu/src/cuda/match_template.cu
index 575f16c41..4e388a5d7 100644
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -42,7 +42,7 @@
 
 #include <cfloat>
 #include "internal_shared.hpp"
-#include "../opencv2/gpu/device/vecmath.hpp"
+#include "opencv2/gpu/device/vecmath.hpp"
 
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -386,10 +386,10 @@ __global__ void matchTemplatePreparedKernel_CCOFF_8U(
 
     if (x < result.cols && y < result.rows)
     {
-        float ccorr = result.ptr(y)[x];
         float image_sum_ = (float)(
                 (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
                 (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
+        float ccorr = result.ptr(y)[x];
         result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
     }
 }
@@ -407,6 +407,46 @@ void matchTemplatePrepared_CCOFF_8U(
 }
+
+
+__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
+        int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
+        const PtrStep_<unsigned int> image_sum_r,
+        const PtrStep_<unsigned int> image_sum_g,
+        DevMem2Df result)
+{
+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x < result.cols && y < result.rows)
+    {
+        float image_sum_r_ = (float)(
+                (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
+                (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
+        float image_sum_g_ = (float)(
+                (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
+                (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
+        float ccorr = result.ptr(y)[x];
+        result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
+                                 - image_sum_g_ * templ_sum_scale_g;
+    }
+}
+
+
+void matchTemplatePrepared_CCOFF_8UC2(
+        int w, int h,
+        const DevMem2D_<unsigned int> image_sum_r,
+        const DevMem2D_<unsigned int> image_sum_g,
+        unsigned int templ_sum_r, unsigned int templ_sum_g,
+        DevMem2Df result)
+{
+    dim3 threads(32, 8);
+    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
+    matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(
+            w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
+            image_sum_r, image_sum_g, result);
+    cudaSafeCall(cudaThreadSynchronize());
+}
 
 
 __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
         int w, int h, float weight, 
         float templ_sum_scale, float templ_sqsum_scale,
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index abe76a1a3..944ec096d 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -945,7 +945,7 @@ struct CV_GpuSumTest: CvTest
         int typemax = hasNativeDoubleSupport(getDevice()) ? CV_64F : CV_32F;
         for (int type = CV_8U; type <= typemax; ++type)
         {
-            gen(1 + rand() % 1000, 1 + rand() % 1000, type, src);
+            gen(1 + rand() % 500, 1 + rand() % 500, type, src);
             a = sum(src);
             b = sum(GpuMat(src));
             if (abs(a[0] - b[0]) > src.size().area() * max_err)