new scheme of sqrSum
This commit is contained in:
parent
0ef16125ae
commit
653b99c9bd
modules/imgproc/src
@ -68,35 +68,47 @@ inline float normAcc_SQDIFF(float num, float denum)
|
|||||||
|
|
||||||
#ifdef CALC_SUM
|
#ifdef CALC_SUM
|
||||||
|
|
||||||
__kernel void calcSum(__global const uchar * templateptr, int template_step, int template_offset,
|
__kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset,
|
||||||
int template_rows, int template_cols, __global float * result)
|
int cols, int total, __global float * dst)
|
||||||
{
|
{
|
||||||
__global const T * template = (__global const T *)(templateptr + template_offset);
|
int lid = get_local_id(0), id = get_global_id(0);
|
||||||
|
|
||||||
WT res = (WT)(0);
|
__local WT localmem[WGS2_ALIGNED];
|
||||||
|
WT accumulator = (WT)(0), tmp;
|
||||||
|
|
||||||
for (int y = 0; y < template_rows; ++y)
|
for ( ; id < total; id += WGS)
|
||||||
{
|
{
|
||||||
for (int x = 0; x < template_cols; ++x)
|
int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(T), src_offset));
|
||||||
{
|
__global const T * src = (__global const T *)(srcptr + src_index);
|
||||||
WT value = convertToWT(template[x]);
|
|
||||||
#ifdef SUM_2
|
|
||||||
#if wdepth == 4
|
|
||||||
res = mad24(value, value, res);
|
|
||||||
#else
|
|
||||||
res = mad(value, value, res);
|
|
||||||
#endif
|
|
||||||
#elif defined SUM_1
|
|
||||||
res += value;
|
|
||||||
#else
|
|
||||||
#error "No operation is specified"
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template = (__global const T *)((__global const uchar *)template + template_step);
|
tmp = convertToWT(src[0]);
|
||||||
|
#if wdepth == 4
|
||||||
|
accumulator = mad24(tmp, tmp, accumulator);
|
||||||
|
#else
|
||||||
|
accumulator = mad(tmp, tmp, accumulator);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
result[0] = convertToDT(res);
|
if (lid < WGS2_ALIGNED)
|
||||||
|
localmem[lid] = accumulator;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)
|
||||||
|
localmem[lid - WGS2_ALIGNED] += accumulator;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)
|
||||||
|
{
|
||||||
|
if (lid < lsize)
|
||||||
|
{
|
||||||
|
int lid2 = lsize + lid;
|
||||||
|
localmem[lid] += localmem[lid2];
|
||||||
|
}
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lid == 0)
|
||||||
|
dst[0] = convertToDT(localmem[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined CCORR
|
#elif defined CCORR
|
||||||
|
@ -40,7 +40,6 @@
|
|||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
#include "precomp.hpp"
|
#include "precomp.hpp"
|
||||||
#define CV_OPENCL_RUN_ASSERT
|
|
||||||
#include "opencl_kernels.hpp"
|
#include "opencl_kernels.hpp"
|
||||||
|
|
||||||
////////////////////////////////////////////////// matchTemplate //////////////////////////////////////////////////////////
|
////////////////////////////////////////////////// matchTemplate //////////////////////////////////////////////////////////
|
||||||
@ -57,28 +56,36 @@ enum
|
|||||||
SUM_1 = 0, SUM_2 = 1
|
SUM_1 = 0, SUM_2 = 1
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool sumTemplate(InputArray _templ, UMat & result, int sum_type)
|
static bool sumTemplate(InputArray _src, UMat & result)
|
||||||
{
|
{
|
||||||
CV_Assert(sum_type == SUM_1 || sum_type == SUM_2);
|
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
||||||
|
|
||||||
int type = _templ.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
|
|
||||||
int wdepth = std::max(CV_32S, depth), wtype = CV_MAKE_TYPE(wdepth, cn);
|
int wdepth = std::max(CV_32S, depth), wtype = CV_MAKE_TYPE(wdepth, cn);
|
||||||
|
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
|
||||||
|
|
||||||
|
int wgs2_aligned = 1;
|
||||||
|
while (wgs2_aligned < (int)wgs)
|
||||||
|
wgs2_aligned <<= 1;
|
||||||
|
wgs2_aligned >>= 1;
|
||||||
|
|
||||||
char cvt[40];
|
char cvt[40];
|
||||||
const char * const sumTypeToStr[] = { "SUM_1", "SUM_2" };
|
|
||||||
ocl::Kernel k("calcSum", ocl::imgproc::match_template_oclsrc,
|
ocl::Kernel k("calcSum", ocl::imgproc::match_template_oclsrc,
|
||||||
format("-D CALC_SUM -D %s -D T=%s -D WT=%s -D convertToWT=%s -D cn=%d -D wdepth=%d",
|
format("-D CALC_SUM -D T=%s -D WT=%s -D cn=%d -D convertToWT=%s -D WGS=%d -D WGS2_ALIGNED=%d -D wdepth=%d",
|
||||||
sumTypeToStr[sum_type], ocl::typeToStr(type), ocl::typeToStr(wtype),
|
ocl::typeToStr(type), ocl::typeToStr(wtype), cn,
|
||||||
ocl::convertTypeStr(depth, wdepth, cn, cvt), cn, wdepth));
|
ocl::convertTypeStr(depth, wdepth, cn, cvt),
|
||||||
|
(int)wgs, wgs2_aligned, wdepth));
|
||||||
if (k.empty())
|
if (k.empty())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
UMat src = _src.getUMat();
|
||||||
result.create(1, 1, CV_32FC1);
|
result.create(1, 1, CV_32FC1);
|
||||||
UMat templ = _templ.getUMat();
|
|
||||||
|
|
||||||
k.args(ocl::KernelArg::ReadOnly(templ), ocl::KernelArg::PtrWriteOnly(result));
|
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
|
||||||
|
resarg = ocl::KernelArg::PtrWriteOnly(result);
|
||||||
|
|
||||||
return k.runTask(false);
|
k.args(srcarg, src.cols, (int)src.total(), resarg);
|
||||||
|
|
||||||
|
size_t globalsize = wgs;
|
||||||
|
return k.run(1, &globalsize, &wgs, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result)
|
static bool matchTemplateNaive_CCORR(InputArray _image, InputArray _templ, OutputArray _result)
|
||||||
@ -123,7 +130,7 @@ static bool matchTemplate_CCORR_NORMED(InputArray _image, InputArray _templ, Out
|
|||||||
integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
|
integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
|
||||||
|
|
||||||
UMat templ_sqsum;
|
UMat templ_sqsum;
|
||||||
if (!sumTemplate(templ, templ_sqsum, SUM_2))
|
if (!sumTemplate(templ, templ_sqsum))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
|
k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
|
||||||
@ -177,7 +184,7 @@ static bool matchTemplate_SQDIFF_NORMED(InputArray _image, InputArray _templ, Ou
|
|||||||
integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
|
integral(image.reshape(1), image_sums, image_sqsums, CV_32F, CV_32F);
|
||||||
|
|
||||||
UMat templ_sqsum;
|
UMat templ_sqsum;
|
||||||
if (!sumTemplate(_templ, templ_sqsum, SUM_2))
|
if (!sumTemplate(_templ, templ_sqsum))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
|
k.args(ocl::KernelArg::ReadOnlyNoSize(image_sqsums), ocl::KernelArg::ReadWrite(result),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user