new scheme of sqrSum

This commit is contained in:
Ilya Lavrenov
2014-02-24 18:38:33 +04:00
parent 0ef16125ae
commit 653b99c9bd
2 changed files with 55 additions and 36 deletions

View File

@@ -68,35 +68,47 @@ inline float normAcc_SQDIFF(float num, float denum)
#ifdef CALC_SUM #ifdef CALC_SUM
__kernel void calcSum(__global const uchar * templateptr, int template_step, int template_offset, __kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset,
int template_rows, int template_cols, __global float * result) int cols, int total, __global float * dst)
{ {
__global const T * template = (__global const T *)(templateptr + template_offset); int lid = get_local_id(0), id = get_global_id(0);
WT res = (WT)(0); __local WT localmem[WGS2_ALIGNED];
WT accumulator = (WT)(0), tmp;
for (int y = 0; y < template_rows; ++y) for ( ; id < total; id += WGS)
{ {
for (int x = 0; x < template_cols; ++x) int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(T), src_offset));
{ __global const T * src = (__global const T *)(srcptr + src_index);
WT value = convertToWT(template[x]);
#ifdef SUM_2 tmp = convertToWT(src[0]);
#if wdepth == 4 #if wdepth == 4
res = mad24(value, value, res); accumulator = mad24(tmp, tmp, accumulator);
#else #else
res = mad(value, value, res); accumulator = mad(tmp, tmp, accumulator);
#endif
#elif defined SUM_1
res += value;
#else
#error "No operation is specified"
#endif #endif
} }
template = (__global const T *)((__global const uchar *)template + template_step); if (lid < WGS2_ALIGNED)
localmem[lid] = accumulator;
barrier(CLK_LOCAL_MEM_FENCE);
if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)
localmem[lid - WGS2_ALIGNED] += accumulator;
barrier(CLK_LOCAL_MEM_FENCE);
for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)
{
if (lid < lsize)
{
int lid2 = lsize + lid;
localmem[lid] += localmem[lid2];
}
barrier(CLK_LOCAL_MEM_FENCE);
} }
result[0] = convertToDT(res); if (lid == 0)
dst[0] = convertToDT(localmem[0]);
} }
#elif defined CCORR #elif defined CCORR

View File

@@ -40,7 +40,6 @@
//M*/ //M*/
#include "precomp.hpp" #include "precomp.hpp"
#define CV_OPENCL_RUN_ASSERT
#include "opencl_kernels.hpp" #include "opencl_kernels.hpp"
////////////////////////////////////////////////// matchTemplate ////////////////////////////////////////////////////////// ////////////////////////////////////////////////// matchTemplate //////////////////////////////////////////////////////////
@@ -57,28 +56,36 @@ enum
SUM_1 = 0, SUM_2 = 1 SUM_1 = 0, SUM_2 = 1
}; };
static bool sumTemplate(InputArray _templ, UMat & result, int sum_type) static bool sumTemplate(InputArray _src, UMat & result)
{ {
CV_Assert(sum_type == SUM_1 || sum_type == SUM_2); int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
int type = _templ.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
int wdepth = std::max(CV_32S, depth), wtype = CV_MAKE_TYPE(wdepth, cn); int wdepth = std::max(CV_32S, depth), wtype = CV_MAKE_TYPE(wdepth, cn);
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs)
wgs2_aligned <<= 1;
wgs2_aligned >>= 1;
char cvt[40]; char cvt[40];
const char * const sumTypeToStr[] = { "SUM_1", "SUM_2" };
ocl::Kernel k("calcSum", ocl::imgproc::match_template_oclsrc, ocl::Kernel k("calcSum", ocl::imgproc::match_template_oclsrc,
format("-D CALC_SUM -D %s -D T=%s -D WT=%s -D convertToWT=%s -D cn=%d -D wdepth=%d", format("-D CALC_SUM -D T=%s -D WT=%s -D cn=%d -D convertToWT=%s -D WGS=%d -D WGS2_ALIGNED=%d -D wdepth=%d",
sumTypeToStr[sum_type], ocl::typeToStr(type), ocl::typeToStr(wtype), ocl::typeToStr(type), ocl::typeToStr(wtype), cn,
ocl::convertTypeStr(depth, wdepth, cn, cvt), cn, wdepth)); ocl::convertTypeStr(depth, wdepth, cn, cvt),
(int)wgs, wgs2_aligned, wdepth));
if (k.empty()) if (k.empty())
return false; return false;
UMat src = _src.getUMat();
result.create(1, 1, CV_32FC1); result.create(1, 1, CV_32FC1);
UMat templ = _templ.getUMat();
k.args(ocl::KernelArg::ReadOnly(templ), ocl::KernelArg::PtrWriteOnly(result)); ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
resarg = ocl::KernelArg::PtrWriteOnly(result);
return k.runTask(false); k.args(srcarg, src.cols, (int)src.total(), resarg);
size_t globalsize = wgs;
return k.run(1, &globalsize, &wgs, false);
} }
static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result) static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result)
@@ -123,7 +130,7 @@ static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, Out
integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F); integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
UMat templ_sqsum; UMat templ_sqsum;
if (!sumTemplate(templ, templ_sqsum, SUM_2)) if (!sumTemplate(templ, templ_sqsum))
return false; return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result), k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
@@ -177,7 +184,7 @@ static bool matchTemplate_SQDIFF_NORMED(InputArray _image, InputArray _templ, Ou
integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F); integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
UMat templ_sqsum; UMat templ_sqsum;
if (!sumTemplate(_templ, templ_sqsum, SUM_2)) if (!sumTemplate(_templ, templ_sqsum))
return false; return false;
k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result), k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),