more fix of mismatch

This commit is contained in:
yao 2013-03-26 14:10:29 +08:00
parent ad6aae4583
commit f36db3a037
3 changed files with 196 additions and 109 deletions

View File

@ -71,6 +71,9 @@ namespace cv
void matchTemplate_SQDIFF_NORMED( void matchTemplate_SQDIFF_NORMED(
const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
void convolve_32F(
const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
void matchTemplate_CCORR( void matchTemplate_CCORR(
const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf); const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf);
@ -90,41 +93,65 @@ namespace cv
void matchTemplateNaive_CCORR( void matchTemplateNaive_CCORR(
const oclMat &image, const oclMat &templ, oclMat &result, int cn); const oclMat &image, const oclMat &templ, oclMat &result, int cn);
void extractFirstChannel_32F(
const oclMat &image, oclMat &result);
// Evaluates optimal template's area threshold. If // Evaluates optimal template's area threshold. If
// template's area is less than the threshold, we use naive match // template's area is less than the threshold, we use naive match
// template version, otherwise FFT-based (if available) // template version, otherwise FFT-based (if available)
static int getTemplateThreshold(int method, int depth) static bool useNaive(int , int , Size )
{ {
switch (method) // FIXME!
{ // always use naive until convolve is imported
case CV_TM_CCORR: return true;
if (depth == CV_32F) return 250;
if (depth == CV_8U) return 300;
break;
case CV_TM_SQDIFF:
if (depth == CV_32F) return 0x7fffffff; // do naive SQDIFF for CV_32F
if (depth == CV_8U) return 300;
break;
}
CV_Error(CV_StsBadArg, "getTemplateThreshold: unsupported match template mode");
return 0;
} }
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// SQDIFF // SQDIFF
void matchTemplate_SQDIFF( void matchTemplate_SQDIFF(
const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &) const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf & buf)
{ {
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) if (useNaive(CV_TM_SQDIFF, image.depth(), templ.size()))
{ {
matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels()); matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
return; return;
} }
else else
{ {
// TODO buf.image_sqsums.resize(1);
CV_Error(CV_StsBadArg, "Not supported yet for this size template");
// TODO, add double support for ocl::integral
// use CPU integral temporarily
Mat sums, sqsums;
cv::integral(Mat(image.reshape(1)), sums, sqsums);
buf.image_sqsums[0] = sqsums;
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR(image, templ, result, buf);
//port CUDA's matchTemplatePrepared_SQDIFF_8U
Context *clCxt = image.clCxt;
string kernelName = "matchTemplate_Prepared_SQDIFF";
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data));
args.push_back( make_pair( sizeof(cl_ulong), (void *)&templ_sqsum));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&templ.rows));
args.push_back( make_pair( sizeof(cl_int), (void *)&templ.cols));
args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {16, 16, 1};
const char * build_opt = image.oclchannels() == 4 ? "-D CN4" : "";
openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U, build_opt);
} }
} }
@ -134,7 +161,6 @@ namespace cv
matchTemplate_CCORR(image, templ, result, buf); matchTemplate_CCORR(image, templ, result, buf);
buf.image_sums.resize(1); buf.image_sums.resize(1);
integral(image.reshape(1), buf.image_sums[0]); integral(image.reshape(1), buf.image_sums[0]);
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0]; unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
@ -156,7 +182,7 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
size_t globalThreads[3] = {result.cols, result.rows, 1}; size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {32, 8, 1}; size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
} }
@ -191,33 +217,39 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
size_t globalThreads[3] = {result.cols, result.rows, 1}; size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {32, 8, 1}; size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
} }
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// CCORR // CCORR
void convolve_32F(
const oclMat &, const oclMat &, oclMat &, MatchTemplateBuf &)
{
CV_Error(-1, "convolve is not fully implemented yet");
}
void matchTemplate_CCORR( void matchTemplate_CCORR(
const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf) const oclMat &image, const oclMat &templ, oclMat &result, MatchTemplateBuf &buf)
{ {
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) if (useNaive(CV_TM_CCORR, image.depth(), templ.size()))
{ {
matchTemplateNaive_CCORR(image, templ, result, image.oclchannels()); matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
return; return;
} }
else else
{ {
CV_Error(CV_StsBadArg, "Not supported yet for this size template");
if(image.depth() == CV_8U && templ.depth() == CV_8U) if(image.depth() == CV_8U && templ.depth() == CV_8U)
{ {
image.convertTo(buf.imagef, CV_32F); image.convertTo(buf.imagef, CV_32F);
templ.convertTo(buf.templf, CV_32F); templ.convertTo(buf.templf, CV_32F);
convolve_32F(buf.imagef, buf.templf, result, buf);
}
else
{
convolve_32F(image, templ, result, buf);
} }
CV_Assert(image.oclchannels() == 1);
oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels()));
filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0));
result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1));
} }
} }
@ -249,7 +281,7 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
size_t globalThreads[3] = {result.cols, result.rows, 1}; size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {32, 8, 1}; size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U); openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, 1, CV_8U);
} }
@ -284,7 +316,7 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
size_t globalThreads[3] = {result.cols, result.rows, 1}; size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {32, 8, 1}; size_t localThreads[3] = {16, 16, 1};
openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
} }
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
@ -301,7 +333,7 @@ namespace cv
kernelName = "matchTemplate_Prepared_CCOFF"; kernelName = "matchTemplate_Prepared_CCOFF";
size_t globalThreads[3] = {result.cols, result.rows, 1}; size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {32, 8, 1}; size_t localThreads[3] = {16, 16, 1};
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
@ -313,22 +345,22 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) ); args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
Vec4f templ_sum = Vec4f::all(0);
// to be continued in the following section // to be continued in the following section
if(image.oclchannels() == 1) if(image.oclchannels() == 1)
{ {
buf.image_sums.resize(1); buf.image_sums.resize(1);
integral(image, buf.image_sums[0]); integral(image, buf.image_sums[0]);
float templ_sum = 0; templ_sum[0] = (float)sum(templ)[0] / templ.size().area();
templ_sum = (float)sum(templ)[0] / templ.size().area();
args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) );
args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) );
args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].step) );
args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) ); args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
} }
else else
{ {
Vec4f templ_sum = Vec4f::all(0);
split(image, buf.images); split(image, buf.images);
templ_sum = sum(templ) / templ.size().area(); templ_sum = sum(templ) / templ.size().area();
buf.image_sums.resize(buf.images.size()); buf.image_sums.resize(buf.images.size());
@ -374,7 +406,7 @@ namespace cv
kernelName = "matchTemplate_Prepared_CCOFF_NORMED"; kernelName = "matchTemplate_Prepared_CCOFF_NORMED";
size_t globalThreads[3] = {result.cols, result.rows, 1}; size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {32, 8, 1}; size_t localThreads[3] = {16, 16, 1};
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) ); args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
@ -387,20 +419,22 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
args.push_back( make_pair( sizeof(cl_float), (void *)&scale) ); args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
Vec4f templ_sum = Vec4f::all(0);
Vec4f templ_sqsum = Vec4f::all(0);
// to be continued in the following section // to be continued in the following section
if(image.oclchannels() == 1) if(image.oclchannels() == 1)
{ {
buf.image_sums.resize(1); buf.image_sums.resize(1);
buf.image_sqsums.resize(1); buf.image_sqsums.resize(1);
integral(image, buf.image_sums[0], buf.image_sqsums[0]); integral(image, buf.image_sums[0], buf.image_sqsums[0]);
float templ_sum = 0;
float templ_sqsum = 0;
templ_sum = (float)sum(templ)[0];
templ_sqsum = sqrSum(templ)[0]; templ_sum[0] = (float)sum(templ)[0];
templ_sqsum -= scale * templ_sum * templ_sum; templ_sqsum[0] = sqrSum(templ)[0];
templ_sum *= scale;
templ_sqsum[0] -= scale * templ_sum[0] * templ_sum[0];
templ_sum[0] *= scale;
args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) ); args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sums[0].data) );
args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sums[0].offset) );
@ -408,13 +442,11 @@ namespace cv
args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) ); args.push_back( make_pair( sizeof(cl_mem), (void *)&buf.image_sqsums[0].data) );
args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].offset) );
args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) ); args.push_back( make_pair( sizeof(cl_int), (void *)&buf.image_sqsums[0].step) );
args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum) ); args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sum[0]) );
args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum) ); args.push_back( make_pair( sizeof(cl_float), (void *)&templ_sqsum[0]) );
} }
else else
{ {
Vec4f templ_sum = Vec4f::all(0);
Vec4f templ_sqsum = Vec4f::all(0);
split(image, buf.images); split(image, buf.images);
templ_sum = sum(templ); templ_sum = sum(templ);
@ -465,7 +497,27 @@ namespace cv
} }
openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth()); openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, image.oclchannels(), image.depth());
} }
void extractFirstChannel_32F(const oclMat &image, oclMat &result)
{
Context *clCxt = image.clCxt;
string kernelName;
kernelName = "extractFirstChannel";
size_t globalThreads[3] = {result.cols, result.rows, 1};
size_t localThreads[3] = {16, 16, 1};
vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&image.data) );
args.push_back( make_pair( sizeof(cl_mem), (void *)&result.data) );
args.push_back( make_pair( sizeof(cl_int), (void *)&result.rows) );
args.push_back( make_pair( sizeof(cl_int), (void *)&result.cols) );
args.push_back( make_pair( sizeof(cl_int), (void *)&image.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&image.step));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
openCLExecuteKernel(clCxt, &match_template, kernelName, globalThreads, localThreads, args, -1, -1);
}
}/*ocl*/ }/*ocl*/
} /*cv*/ } /*cv*/

View File

@ -45,22 +45,28 @@
#pragma OPENCL EXTENSION cl_amd_printf : enable #pragma OPENCL EXTENSION cl_amd_printf : enable
#if defined (__ATI__) #if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__) #ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif #endif
#if !defined(USE_SQR_INTEGRAL) && (defined (__ATI__) || defined (__NVIDIA__))
#define TYPE_IMAGE_SQSUM double #define TYPE_IMAGE_SQSUM double
#else #else
#define TYPE_IMAGE_SQSUM ulong #define TYPE_IMAGE_SQSUM float
#endif
#ifndef CN4
#define CN4 1
#else
#define CN4 4
#endif #endif
////////////////////////////////////////////////// //////////////////////////////////////////////////
// utilities // utilities
#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, gidx + img_sqsums_offset + ox) #define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4)
#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) #define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox)
// normAcc* are accurate normalization routines which make GPU matchTemplate // normAcc* are accurate normalization routines which make GPU matchTemplate
// consistent with CPU one // consistent with CPU one
@ -95,7 +101,7 @@ float normAcc_SQDIFF(float num, float denum)
__kernel __kernel
void normalizeKernel_C1_D0 void normalizeKernel_C1_D0
( (
__global const TYPE_IMAGE_SQSUM * img_sqsums, __global const float * img_sqsums,
__global float * res, __global float * res,
ulong tpl_sqsum, ulong tpl_sqsum,
int res_rows, int res_rows,
@ -119,8 +125,8 @@ void normalizeKernel_C1_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float image_sqsum_ = (float)( float image_sqsum_ = (float)(
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum)); res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
} }
} }
@ -152,8 +158,8 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float image_sqsum_ = (float)( float image_sqsum_ = (float)(
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum; res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
} }
} }
@ -161,7 +167,7 @@ void matchTemplate_Prepared_SQDIFF_C1_D0
__kernel __kernel
void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
( (
__global const TYPE_IMAGE_SQSUM * img_sqsums, __global const float * img_sqsums,
__global float * res, __global float * res,
ulong tpl_sqsum, ulong tpl_sqsum,
int res_rows, int res_rows,
@ -185,10 +191,10 @@ void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float image_sqsum_ = (float)( float image_sqsum_ = (float)(
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum, res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum,
sqrt(image_sqsum_ * tpl_sqsum)); sqrt(image_sqsum_ * tpl_sqsum));
} }
} }
@ -628,8 +634,8 @@ void matchTemplate_Prepared_CCOFF_C1_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float sum = (float)( float sum = (float)(
(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
- (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
res[res_idx] -= sum * tpl_sum; res[res_idx] -= sum * tpl_sum;
} }
} }
@ -671,17 +677,17 @@ void matchTemplate_Prepared_CCOFF_C4_D0
{ {
float ccorr = res[res_idx]; float ccorr = res[res_idx];
ccorr -= tpl_sum_c0*(float)( ccorr -= tpl_sum_c0*(float)(
(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
ccorr -= tpl_sum_c1*(float)( ccorr -= tpl_sum_c1*(float)(
(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
ccorr -= tpl_sum_c2*(float)( ccorr -= tpl_sum_c2*(float)(
(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
ccorr -= tpl_sum_c3*(float)( ccorr -= tpl_sum_c3*(float)(
(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
res[res_idx] = ccorr; res[res_idx] = ccorr;
} }
} }
@ -702,7 +708,7 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
__global const uint * img_sums, __global const uint * img_sums,
int img_sums_offset, int img_sums_offset,
int img_sums_step, int img_sums_step,
__global const TYPE_IMAGE_SQSUM * img_sqsums, __global const float * img_sqsums,
int img_sqsums_offset, int img_sqsums_offset,
int img_sqsums_step, int img_sqsums_step,
float tpl_sum, float tpl_sum,
@ -725,12 +731,12 @@ void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float image_sum_ = (float)( float image_sum_ = (float)(
(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) (img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)])
- (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); - (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)]));
float image_sqsum_ = (float)( float image_sqsum_ = (float)(
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); (img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)]));
res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum,
sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_)));
} }
@ -754,10 +760,10 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
__global const uint * img_sums_c3, __global const uint * img_sums_c3,
int img_sums_offset, int img_sums_offset,
int img_sums_step, int img_sums_step,
__global const TYPE_IMAGE_SQSUM * img_sqsums_c0, __global const float * img_sqsums_c0,
__global const TYPE_IMAGE_SQSUM * img_sqsums_c1, __global const float * img_sqsums_c1,
__global const TYPE_IMAGE_SQSUM * img_sqsums_c2, __global const float * img_sqsums_c2,
__global const TYPE_IMAGE_SQSUM * img_sqsums_c3, __global const float * img_sqsums_c3,
int img_sqsums_offset, int img_sqsums_offset,
int img_sqsums_step, int img_sqsums_step,
float tpl_sum_c0, float tpl_sum_c0,
@ -782,42 +788,71 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
if(gidx < res_cols && gidy < res_rows) if(gidx < res_cols && gidy < res_rows)
{ {
float image_sum_c0 = (float)( float image_sum_c0 = (float)(
(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
float image_sum_c1 = (float)( float image_sum_c1 = (float)(
(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
float image_sum_c2 = (float)( float image_sum_c2 = (float)(
(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
float image_sum_c3 = (float)( float image_sum_c3 = (float)(
(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
- (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
float image_sqsum_c0 = (float)( float image_sqsum_c0 = (float)(
(img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
float image_sqsum_c1 = (float)( float image_sqsum_c1 = (float)(
(img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
float image_sqsum_c2 = (float)( float image_sqsum_c2 = (float)(
(img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
float image_sqsum_c3 = (float)( float image_sqsum_c3 = (float)(
(img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
(img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
float num = res[res_idx] - float num = res[res_idx] -
image_sum_c0 * tpl_sum_c0 - image_sum_c0 * tpl_sum_c0 -
image_sum_c1 * tpl_sum_c1 - image_sum_c1 * tpl_sum_c1 -
image_sum_c2 * tpl_sum_c2 - image_sum_c2 * tpl_sum_c2 -
image_sum_c3 * tpl_sum_c3; image_sum_c3 * tpl_sum_c3;
float denum = sqrt( tpl_sqsum * ( float denum = sqrt( tpl_sqsum * (
image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3) image_sqsum_c3 - weight * image_sum_c0 * image_sum_c3)
); );
res[res_idx] = normAcc(num, denum); res[res_idx] = normAcc(num, denum);
} }
} }
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
__kernel
void extractFirstChannel
(
const __global float4* img,
__global float* res,
int rows,
int cols,
int img_offset,
int res_offset,
int img_step,
int res_step
)
{
img_step /= sizeof(float4);
res_step /= sizeof(float);
img_offset /= sizeof(float4);
res_offset /= sizeof(float);
img += img_offset;
res += res_offset;
int gidx = get_global_id(0);
int gidy = get_global_id(1);
if(gidx < cols && gidy < rows)
{
res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x;
}
}

View File

@ -75,7 +75,7 @@ PARAM_TEST_CASE(MatchTemplate8U, cv::Size, TemplateSize, Channels, TemplateMetho
} }
}; };
TEST_P(MatchTemplate8U, DISABLED_Accuracy) TEST_P(MatchTemplate8U, Accuracy)
{ {
std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl; std::cout << "Method: " << TEMPLATE_METHOD_NAMES[method] << std::endl;
@ -138,18 +138,18 @@ TEST_P(MatchTemplate32F, Accuracy)
EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss); EXPECT_MAT_NEAR(dst_gold, mat_dst, templ_size.area() * 1e-1, sss);
} }
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate8U,
testing::Combine( testing::Combine(
MTEMP_SIZES, MTEMP_SIZES,
testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
testing::Values(Channels(1), Channels(3), Channels(4)), testing::Values(Channels(1), Channels(3), Channels(4)),
ALL_TEMPLATE_METHODS ALL_TEMPLATE_METHODS
) )
); );
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine( INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MatchTemplate32F, testing::Combine(
MTEMP_SIZES, MTEMP_SIZES,
testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16)), TemplateSize(cv::Size(30, 30))),
testing::Values(Channels(1), Channels(3), Channels(4)), testing::Values(Channels(1), Channels(3), Channels(4)),
testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
#endif #endif