Merge branch '2.4'
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
//
|
||||
// @Authors
|
||||
// Nathan, liujun@multicorewareinc.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
@@ -60,11 +61,21 @@ namespace cv
|
||||
}
|
||||
}
|
||||
|
||||
static const int OPT_SIZE = 100;
|
||||
|
||||
static const char * T_ARR [] = {
|
||||
"uchar",
|
||||
"char",
|
||||
"ushort",
|
||||
"short",
|
||||
"int",
|
||||
"float -D T_FLOAT",
|
||||
"double"};
|
||||
|
||||
template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
|
||||
void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
const oclMat &trainIdx, const oclMat &distance, int distType)
|
||||
{
|
||||
CV_Assert(query.type() == CV_32F);
|
||||
cv::ocl::Context *ctx = query.clCxt;
|
||||
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||
size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||
@@ -73,6 +84,11 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
|
||||
int m_size = MAX_DESC_LEN;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
|
||||
T_ARR[query.depth()], distType, block_size, m_size);
|
||||
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -81,18 +97,15 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( std::make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&m_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_UnrollMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -106,7 +119,6 @@ template < int BLOCK_SIZE/*, typename Mask*/ >
|
||||
void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
const oclMat &trainIdx, const oclMat &distance, int distType)
|
||||
{
|
||||
CV_Assert(query.type() == CV_32F);
|
||||
cv::ocl::Context *ctx = query.clCxt;
|
||||
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||
size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||
@@ -114,6 +126,10 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
int block_size = BLOCK_SIZE;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
|
||||
T_ARR[query.depth()], distType, block_size);
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -122,17 +138,15 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( std::make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_Match";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,7 +161,6 @@ template < int BLOCK_SIZE, int MAX_DESC_LEN/*, typename Mask*/ >
|
||||
void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
|
||||
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
|
||||
{
|
||||
CV_Assert(query.type() == CV_32F);
|
||||
cv::ocl::Context *ctx = query.clCxt;
|
||||
size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
|
||||
size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||
@@ -156,6 +169,11 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
||||
int m_size = MAX_DESC_LEN;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
|
||||
T_ARR[query.depth()], distType, block_size, m_size);
|
||||
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -166,8 +184,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
||||
args.push_back( std::make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&m_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@@ -175,11 +191,10 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_RadiusUnrollMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -188,7 +203,6 @@ template < int BLOCK_SIZE/*, typename Mask*/ >
|
||||
void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
|
||||
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
|
||||
{
|
||||
CV_Assert(query.type() == CV_32F);
|
||||
cv::ocl::Context *ctx = query.clCxt;
|
||||
size_t globalSize[] = {(train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, (query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, 1};
|
||||
size_t localSize[] = {BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||
@@ -196,6 +210,11 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
|
||||
int block_size = BLOCK_SIZE;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
|
||||
T_ARR[query.depth()], distType, block_size);
|
||||
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -206,7 +225,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
||||
args.push_back( std::make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@@ -214,11 +232,10 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&trainIdx.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_RadiusMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -293,6 +310,11 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
|
||||
int m_size = MAX_DESC_LEN;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
|
||||
T_ARR[query.depth()], distType, block_size, m_size);
|
||||
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -301,18 +323,15 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( std::make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&m_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_knnUnrollMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -327,6 +346,11 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
int block_size = BLOCK_SIZE;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
|
||||
T_ARR[query.depth()], distType, block_size);
|
||||
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -335,17 +359,15 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( std::make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_knnMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -360,6 +382,11 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
|
||||
int m_size = MAX_DESC_LEN;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d -D MAX_DESC_LEN=%d",
|
||||
T_ARR[query.depth()], distType, block_size, m_size);
|
||||
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -374,11 +401,10 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_calcDistanceUnrolled";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -392,6 +418,11 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask
|
||||
int block_size = BLOCK_SIZE;
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt,
|
||||
"-D T=%s -D DIST_TYPE=%d -D BLOCK_SIZE=%d",
|
||||
T_ARR[query.depth()], distType, block_size);
|
||||
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@@ -405,11 +436,10 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void *)&distType ));
|
||||
|
||||
String kernelName = "BruteForceMatch_calcDistance";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1, opt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -471,7 +501,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
|
||||
//args.push_back( std::make_pair( sizeof(cl_int), (void *)&train.cols ));
|
||||
//args.push_back( std::make_pair( sizeof(cl_int), (void *)&query.step ));
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, trainIdx.depth(), -1);
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -531,24 +561,15 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchSingle(const oclMat &query, const
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
|
||||
int callType = query.depth();
|
||||
if (callType != 5)
|
||||
CV_Error(Error::StsUnsupportedFormat, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
|
||||
if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
|
||||
|| callType != 2 || callType != 4)))
|
||||
{
|
||||
CV_Error(Error::BadDepth, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
}
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
CV_Assert(train.cols == query.cols && train.type() == query.type());
|
||||
|
||||
trainIdx.create(1, query.rows, CV_32S);
|
||||
distance.create(1, query.rows, CV_32F);
|
||||
ensureSizeIsEnough(1, query.rows, CV_32S, trainIdx);
|
||||
ensureSizeIsEnough(1, query.rows, CV_32F, distance);
|
||||
|
||||
matchDispatcher(query, train, mask, trainIdx, distance, distType);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector<DMatch> &matches)
|
||||
@@ -594,7 +615,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons
|
||||
|
||||
void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, std::vector<DMatch> &matches, const oclMat &mask)
|
||||
{
|
||||
CV_Assert(mask.empty()); // mask is not supported at the moment
|
||||
assert(mask.empty()); // mask is not supported at the moment
|
||||
oclMat trainIdx, distance;
|
||||
matchSingle(query, train, trainIdx, distance, mask);
|
||||
matchDownload(trainIdx, distance, matches);
|
||||
@@ -650,24 +671,17 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchCollection(const oclMat &query, c
|
||||
if (query.empty() || trainCollection.empty())
|
||||
return;
|
||||
|
||||
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
|
||||
int callType = query.depth();
|
||||
if (callType != 5)
|
||||
CV_Error(Error::StsUnsupportedFormat, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
|
||||
if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
|
||||
|| callType != 2 || callType != 4)))
|
||||
{
|
||||
CV_Error(Error::BadDepth, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
}
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
|
||||
trainIdx.create(1, query.rows, CV_32S);
|
||||
imgIdx.create(1, query.rows, CV_32S);
|
||||
distance.create(1, query.rows, CV_32F);
|
||||
const int nQuery = query.rows;
|
||||
|
||||
ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32F, distance);
|
||||
|
||||
matchDispatcher(query, (const oclMat *)trainCollection.ptr(), trainCollection.cols, masks, trainIdx, imgIdx, distance, distType);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void cv::ocl::BruteForceMatcher_OCL_base::matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector<DMatch> &matches)
|
||||
@@ -736,36 +750,29 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatchSingle(const oclMat &query, co
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
|
||||
int callType = query.depth();
|
||||
|
||||
if (callType != 5)
|
||||
CV_Error(Error::StsUnsupportedFormat, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
|
||||
if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
|
||||
|| callType != 2 || callType != 4)))
|
||||
{
|
||||
CV_Error(Error::BadDepth, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
}
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
CV_Assert(train.type() == query.type() && train.cols == query.cols);
|
||||
|
||||
const int nQuery = query.rows;
|
||||
const int nTrain = train.rows;
|
||||
|
||||
if (k == 2)
|
||||
{
|
||||
trainIdx.create(1, query.rows, CV_32SC2);
|
||||
distance.create(1, query.rows, CV_32FC2);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
|
||||
}
|
||||
else
|
||||
{
|
||||
trainIdx.create(query.rows, k, CV_32S);
|
||||
distance.create(query.rows, k, CV_32F);
|
||||
allDist.create(query.rows, train.rows, CV_32FC1);
|
||||
ensureSizeIsEnough(nQuery, k, CV_32S, trainIdx);
|
||||
ensureSizeIsEnough(nQuery, k, CV_32F, distance);
|
||||
ensureSizeIsEnough(nQuery, nTrain, CV_32FC1, allDist);
|
||||
}
|
||||
|
||||
trainIdx.setTo(Scalar::all(-1));
|
||||
|
||||
kmatchDispatcher(query, train, k, mask, trainIdx, distance, allDist, distType);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void cv::ocl::BruteForceMatcher_OCL_base::knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector< std::vector<DMatch> > &matches, bool compactResult)
|
||||
@@ -839,33 +846,14 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &quer
|
||||
|
||||
typedef void (*caller_t)(const oclMat & query, const oclMat & trains, const oclMat & masks,
|
||||
const oclMat & trainIdx, const oclMat & imgIdx, const oclMat & distance);
|
||||
#if 0
|
||||
static const caller_t callers[3][6] =
|
||||
{
|
||||
{
|
||||
ocl_match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/,
|
||||
ocl_match2L1_gpu<unsigned short>, ocl_match2L1_gpu<short>,
|
||||
ocl_match2L1_gpu<int>, ocl_match2L1_gpu<float>
|
||||
},
|
||||
{
|
||||
0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/,
|
||||
0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/,
|
||||
0/*match2L2_gpu<int>*/, ocl_match2L2_gpu<float>
|
||||
},
|
||||
{
|
||||
ocl_match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/,
|
||||
ocl_match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/,
|
||||
ocl_match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
|
||||
const int nQuery = query.rows;
|
||||
|
||||
trainIdx.create(1, nQuery, CV_32SC2);
|
||||
imgIdx.create(1, nQuery, CV_32SC2);
|
||||
distance.create(1, nQuery, CV_32SC2);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
|
||||
|
||||
trainIdx.setTo(Scalar::all(-1));
|
||||
|
||||
@@ -972,7 +960,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec
|
||||
temp.reserve(2 * k);
|
||||
|
||||
matches.resize(query.rows);
|
||||
std::for_each(matches.begin(), matches.end(), std::bind2nd(std::mem_fun_ref(&std::vector<DMatch>::reserve), k));
|
||||
for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&std::vector<DMatch>::reserve), k));
|
||||
|
||||
for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx)
|
||||
{
|
||||
@@ -996,7 +984,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec
|
||||
|
||||
if (compactResult)
|
||||
{
|
||||
std::vector< std::vector<DMatch> >::iterator new_end = std::remove_if(matches.begin(), matches.end(), std::mem_fun_ref(&std::vector<DMatch>::empty));
|
||||
std::vector< std::vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), mem_fun_ref(&std::vector<DMatch>::empty));
|
||||
matches.erase(new_end, matches.end());
|
||||
}
|
||||
}
|
||||
@@ -1004,36 +992,30 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, std::vec
|
||||
|
||||
// radiusMatchSingle
|
||||
void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchSingle(const oclMat &query, const oclMat &train,
|
||||
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
|
||||
oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, const oclMat &mask)
|
||||
{
|
||||
if (query.empty() || train.empty())
|
||||
return;
|
||||
|
||||
// match1 doesn't support signed char type, match2 only support float, hamming support uchar, ushort and int
|
||||
int callType = query.depth();
|
||||
if (callType != 5)
|
||||
CV_Error(Error::StsUnsupportedFormat, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
|
||||
if ((distType == 0 && callType == 1 ) || (distType == 1 && callType != 5) || (distType == 2 && (callType != 0
|
||||
|| callType != 2 || callType != 4)))
|
||||
{
|
||||
CV_Error(Error::BadDepth, "BruteForceMatch OpenCL only support float type query!\n");
|
||||
}
|
||||
const int nQuery = query.rows;
|
||||
const int nTrain = train.rows;
|
||||
|
||||
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
|
||||
CV_Assert(train.type() == query.type() && train.cols == query.cols);
|
||||
CV_Assert(trainIdx.empty() || (trainIdx.rows == query.rows && trainIdx.size() == distance.size()));
|
||||
|
||||
nMatches.create(1, query.rows, CV_32SC1);
|
||||
ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
|
||||
if (trainIdx.empty())
|
||||
{
|
||||
trainIdx.create(query.rows, std::max((train.rows/ 100), 10), CV_32SC1);
|
||||
distance.create(query.rows, std::max((train.rows/ 100), 10), CV_32FC1);
|
||||
ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
|
||||
ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
|
||||
}
|
||||
|
||||
nMatches.setTo(Scalar::all(0));
|
||||
|
||||
matchDispatcher(query, train, maxDistance, mask, trainIdx, distance, nMatches, distType);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches,
|
||||
|
||||
@@ -362,6 +362,13 @@ namespace cv
|
||||
{
|
||||
case WAVEFRONT_SIZE:
|
||||
{
|
||||
bool is_cpu = false;
|
||||
queryDeviceInfo(IS_CPU_DEVICE, &is_cpu);
|
||||
if(is_cpu)
|
||||
{
|
||||
*(int*)info = 1;
|
||||
return;
|
||||
}
|
||||
#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
|
||||
try
|
||||
{
|
||||
|
||||
@@ -47,6 +47,10 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#define MAX_FLOAT 3.40282e+038f
|
||||
|
||||
#ifndef T
|
||||
#define T float
|
||||
#endif
|
||||
|
||||
#ifndef BLOCK_SIZE
|
||||
#define BLOCK_SIZE 16
|
||||
#endif
|
||||
@@ -54,68 +58,85 @@
|
||||
#define MAX_DESC_LEN 64
|
||||
#endif
|
||||
|
||||
int bit1Count(float x)
|
||||
{
|
||||
int c = 0;
|
||||
int ix = (int)x;
|
||||
for (int i = 0 ; i < 32 ; i++)
|
||||
{
|
||||
c += ix & 0x1;
|
||||
ix >>= 1;
|
||||
}
|
||||
return (float)c;
|
||||
}
|
||||
|
||||
#ifndef DIST_TYPE
|
||||
#define DIST_TYPE 0
|
||||
#endif
|
||||
|
||||
#if (DIST_TYPE == 0)
|
||||
#define DIST(x, y) fabs((x) - (y))
|
||||
#elif (DIST_TYPE == 1)
|
||||
#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
|
||||
#elif (DIST_TYPE == 2)
|
||||
#define DIST(x, y) bit1Count((uint)(x) ^ (uint)(y))
|
||||
#endif
|
||||
|
||||
|
||||
float reduce_block(__local float *s_query,
|
||||
__local float *s_train,
|
||||
int lidx,
|
||||
int lidy
|
||||
)
|
||||
//http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
|
||||
int bit1Count(int v)
|
||||
{
|
||||
float result = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||
{
|
||||
result += DIST(s_query[lidy * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
|
||||
}
|
||||
return result;
|
||||
v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
|
||||
v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
|
||||
return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
|
||||
}
|
||||
|
||||
float reduce_multi_block(__local float *s_query,
|
||||
__local float *s_train,
|
||||
int block_index,
|
||||
int lidx,
|
||||
int lidy
|
||||
)
|
||||
// dirty fix for non-template support
|
||||
#if (DIST_TYPE == 0) // L1Dist
|
||||
# ifdef T_FLOAT
|
||||
# define DIST(x, y) fabs((x) - (y))
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
# else
|
||||
# define DIST(x, y) abs((x) - (y))
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
# endif
|
||||
#define DIST_RES(x) (x)
|
||||
#elif (DIST_TYPE == 1) // L2Dist
|
||||
#define DIST(x, y) (((x) - (y)) * ((x) - (y)))
|
||||
typedef float value_type;
|
||||
typedef float result_type;
|
||||
#define DIST_RES(x) sqrt(x)
|
||||
#elif (DIST_TYPE == 2) // Hamming
|
||||
#define DIST(x, y) bit1Count( (x) ^ (y) )
|
||||
typedef int value_type;
|
||||
typedef int result_type;
|
||||
#define DIST_RES(x) (x)
|
||||
#endif
|
||||
|
||||
result_type reduce_block(
|
||||
__local value_type *s_query,
|
||||
__local value_type *s_train,
|
||||
int lidx,
|
||||
int lidy
|
||||
)
|
||||
{
|
||||
float result = 0;
|
||||
result_type result = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||
{
|
||||
result += DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);
|
||||
result += DIST(
|
||||
s_query[lidy * BLOCK_SIZE + j],
|
||||
s_train[j * BLOCK_SIZE + lidx]);
|
||||
}
|
||||
return result;
|
||||
return DIST_RES(result);
|
||||
}
|
||||
|
||||
result_type reduce_multi_block(
|
||||
__local value_type *s_query,
|
||||
__local value_type *s_train,
|
||||
int block_index,
|
||||
int lidx,
|
||||
int lidy
|
||||
)
|
||||
{
|
||||
result_type result = 0;
|
||||
#pragma unroll
|
||||
for (int j = 0 ; j < BLOCK_SIZE ; j++)
|
||||
{
|
||||
result += DIST(
|
||||
s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j],
|
||||
s_train[j * BLOCK_SIZE + lidx]);
|
||||
}
|
||||
return DIST_RES(result);
|
||||
}
|
||||
|
||||
/* 2dim launch, global size: dim0 is (query rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, dim1 is BLOCK_SIZE
|
||||
local size: dim0 is BLOCK_SIZE, dim1 is BLOCK_SIZE.
|
||||
*/
|
||||
__kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
__kernel void BruteForceMatch_UnrollMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
@@ -127,13 +148,12 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
int step
|
||||
)
|
||||
{
|
||||
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
|
||||
int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||
// load the query into local memory.
|
||||
@@ -151,7 +171,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
volatile int imgIdx = 0;
|
||||
for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)
|
||||
{
|
||||
float result = 0;
|
||||
result_type result = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
|
||||
{
|
||||
@@ -207,9 +227,9 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_Match_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
__kernel void BruteForceMatch_Match(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
@@ -230,14 +250,13 @@ __kernel void BruteForceMatch_Match_D5(
|
||||
float myBestDistance = MAX_FLOAT;
|
||||
int myBestTrainIdx = -1;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
// loop
|
||||
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||
{
|
||||
//Dist dist;
|
||||
float result = 0;
|
||||
result_type result = 0;
|
||||
for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
@@ -299,9 +318,9 @@ __kernel void BruteForceMatch_Match_D5(
|
||||
}
|
||||
|
||||
//radius_unrollmatch
|
||||
__kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
__kernel void BruteForceMatch_RadiusUnrollMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
@@ -325,10 +344,10 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
||||
const int queryIdx = groupidy * BLOCK_SIZE + lidy;
|
||||
const int trainIdx = groupidx * BLOCK_SIZE + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
float result = 0;
|
||||
result_type result = 0;
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; ++i)
|
||||
{
|
||||
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||
@@ -345,7 +364,8 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
if (queryIdx < query_rows && trainIdx < train_rows &&
|
||||
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||
|
||||
@@ -359,9 +379,9 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
||||
}
|
||||
|
||||
//radius_match
|
||||
__kernel void BruteForceMatch_RadiusMatch_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
__kernel void BruteForceMatch_RadiusMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
float maxDistance,
|
||||
//__global float *mask,
|
||||
__global int *bestTrainIdx,
|
||||
@@ -385,10 +405,10 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
|
||||
const int queryIdx = groupidy * BLOCK_SIZE + lidy;
|
||||
const int trainIdx = groupidx * BLOCK_SIZE + lidx;
|
||||
|
||||
__local float *s_query = sharebuffer;
|
||||
__local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
float result = 0;
|
||||
result_type result = 0;
|
||||
for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)
|
||||
{
|
||||
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||
@@ -405,7 +425,8 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
if (queryIdx < query_rows && trainIdx < train_rows &&
|
||||
convert_float(result) < maxDistance/* && mask(queryIdx, trainIdx)*/)
|
||||
{
|
||||
unsigned int ind = atom_inc(nMatches + queryIdx);
|
||||
|
||||
@@ -419,9 +440,9 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
|
||||
}
|
||||
|
||||
|
||||
__kernel void BruteForceMatch_knnUnrollMatch_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
__kernel void BruteForceMatch_knnUnrollMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
@@ -438,8 +459,8 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;
|
||||
|
||||
// load the query into local memory.
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)
|
||||
@@ -457,10 +478,9 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
||||
volatile int imgIdx = 0;
|
||||
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||
{
|
||||
float result = 0;
|
||||
result_type result = 0;
|
||||
for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)
|
||||
{
|
||||
const int loadX = lidx + i * BLOCK_SIZE;
|
||||
//load a BLOCK_SIZE * BLOCK_SIZE block into local train.
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
s_train[lidx * BLOCK_SIZE + lidy] = loadx < train_cols ? train[min(t * BLOCK_SIZE + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
|
||||
@@ -494,8 +514,8 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
local float *s_distance = (local float *)sharebuffer;
|
||||
local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||
__local float *s_distance = (local float *)sharebuffer;
|
||||
__local int *s_trainIdx = (local int *)(sharebuffer + BLOCK_SIZE * BLOCK_SIZE);
|
||||
|
||||
// find BestMatch
|
||||
s_distance += lidy * BLOCK_SIZE;
|
||||
@@ -565,9 +585,9 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void BruteForceMatch_knnMatch_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
__kernel void BruteForceMatch_knnMatch(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
@@ -584,8 +604,8 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
||||
const int groupidx = get_group_id(0);
|
||||
|
||||
const int queryIdx = groupidx * BLOCK_SIZE + lidy;
|
||||
local float *s_query = sharebuffer;
|
||||
local float *s_train = sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
__local value_type *s_query = (__local value_type *)sharebuffer;
|
||||
__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
float myBestDistance1 = MAX_FLOAT;
|
||||
float myBestDistance2 = MAX_FLOAT;
|
||||
@@ -595,7 +615,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
||||
//loop
|
||||
for (int t = 0 ; t < (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE ; t++)
|
||||
{
|
||||
float result = 0.0f;
|
||||
result_type result = 0.0f;
|
||||
for (int i = 0 ; i < (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE ; i++)
|
||||
{
|
||||
const int loadx = lidx + i * BLOCK_SIZE;
|
||||
@@ -708,9 +728,9 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
kernel void BruteForceMatch_calcDistanceUnrolled(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
@@ -723,9 +743,9 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5(
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_calcDistance_D5(
|
||||
__global float *query,
|
||||
__global float *train,
|
||||
kernel void BruteForceMatch_calcDistance(
|
||||
__global T *query,
|
||||
__global T *train,
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
@@ -738,7 +758,7 @@ kernel void BruteForceMatch_calcDistance_D5(
|
||||
/* Todo */
|
||||
}
|
||||
|
||||
kernel void BruteForceMatch_findBestMatch_D5(
|
||||
kernel void BruteForceMatch_findBestMatch(
|
||||
__global float *allDist,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
@@ -746,4 +766,4 @@ kernel void BruteForceMatch_findBestMatch_D5(
|
||||
)
|
||||
{
|
||||
/* Todo */
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,8 +69,10 @@ inline float calc(int x, int y)
|
||||
// dx_buf output dx buffer
|
||||
// dy_buf output dy buffer
|
||||
__kernel
|
||||
void calcSobelRowPass
|
||||
(
|
||||
void
|
||||
__attribute__((reqd_work_group_size(16,16,1)))
|
||||
calcSobelRowPass
|
||||
(
|
||||
__global const uchar * src,
|
||||
__global int * dx_buf,
|
||||
__global int * dy_buf,
|
||||
@@ -82,10 +84,8 @@ __kernel
|
||||
int dx_buf_offset,
|
||||
int dy_buf_step,
|
||||
int dy_buf_offset
|
||||
)
|
||||
)
|
||||
{
|
||||
//src_step /= sizeof(*src);
|
||||
//src_offset /= sizeof(*src);
|
||||
dx_buf_step /= sizeof(*dx_buf);
|
||||
dx_buf_offset /= sizeof(*dx_buf);
|
||||
dy_buf_step /= sizeof(*dy_buf);
|
||||
@@ -99,24 +99,23 @@ __kernel
|
||||
|
||||
__local int smem[16][18];
|
||||
|
||||
smem[lidy][lidx + 1] = src[gidx + gidy * src_step + src_offset];
|
||||
smem[lidy][lidx + 1] =
|
||||
src[gidx + min(gidy, rows - 1) * src_step + src_offset];
|
||||
if(lidx == 0)
|
||||
{
|
||||
smem[lidy][0] = src[max(gidx - 1, 0) + gidy * src_step + src_offset];
|
||||
smem[lidy][17] = src[min(gidx + 16, cols - 1) + gidy * src_step + src_offset];
|
||||
smem[lidy][0] =
|
||||
src[max(gidx - 1, 0) + min(gidy, rows - 1) * src_step + src_offset];
|
||||
smem[lidy][17] =
|
||||
src[min(gidx + 16, cols - 1) + min(gidy, rows - 1) * src_step + src_offset];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(gidy < rows)
|
||||
if(gidy < rows && gidx < cols)
|
||||
{
|
||||
|
||||
if(gidx < cols)
|
||||
{
|
||||
dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
|
||||
-smem[lidy][lidx] + smem[lidy][lidx + 2];
|
||||
dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
|
||||
smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
|
||||
}
|
||||
dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
|
||||
-smem[lidy][lidx] + smem[lidy][lidx + 2];
|
||||
dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
|
||||
smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,8 +128,10 @@ __kernel
|
||||
// dy direvitive in y direction output
|
||||
// mag magnitude direvitive of xy output
|
||||
__kernel
|
||||
void calcMagnitude_buf
|
||||
(
|
||||
void
|
||||
__attribute__((reqd_work_group_size(16,16,1)))
|
||||
calcMagnitude_buf
|
||||
(
|
||||
__global const int * dx_buf,
|
||||
__global const int * dy_buf,
|
||||
__global int * dx,
|
||||
@@ -148,7 +149,7 @@ __kernel
|
||||
int dy_offset,
|
||||
int mag_step,
|
||||
int mag_offset
|
||||
)
|
||||
)
|
||||
{
|
||||
dx_buf_step /= sizeof(*dx_buf);
|
||||
dx_buf_offset /= sizeof(*dx_buf);
|
||||
@@ -170,30 +171,33 @@ __kernel
|
||||
__local int sdx[18][16];
|
||||
__local int sdy[18][16];
|
||||
|
||||
sdx[lidy + 1][lidx] = dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset];
|
||||
sdy[lidy + 1][lidx] = dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset];
|
||||
sdx[lidy + 1][lidx] =
|
||||
dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
|
||||
sdy[lidy + 1][lidx] =
|
||||
dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
|
||||
if(lidy == 0)
|
||||
{
|
||||
sdx[0][lidx] = dx_buf[gidx + max(gidy - 1, 0) * dx_buf_step + dx_buf_offset];
|
||||
sdx[17][lidx] = dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset];
|
||||
sdx[0][lidx] =
|
||||
dx_buf[gidx + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset];
|
||||
sdx[17][lidx] =
|
||||
dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset];
|
||||
|
||||
sdy[0][lidx] = dy_buf[gidx + max(gidy - 1, 0) * dy_buf_step + dy_buf_offset];
|
||||
sdy[17][lidx] = dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset];
|
||||
sdy[0][lidx] =
|
||||
dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset];
|
||||
sdy[17][lidx] =
|
||||
dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(gidx < cols)
|
||||
if(gidx < cols && gidy < rows)
|
||||
{
|
||||
if(gidy < rows)
|
||||
{
|
||||
int x = sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
|
||||
int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
|
||||
int x = sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
|
||||
int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
|
||||
|
||||
dx[gidx + gidy * dx_step + dx_offset] = x;
|
||||
dy[gidx + gidy * dy_step + dy_offset] = y;
|
||||
dx[gidx + gidy * dx_step + dx_offset] = x;
|
||||
dy[gidx + gidy * dy_step + dy_offset] = y;
|
||||
|
||||
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
|
||||
}
|
||||
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,8 +210,8 @@ __kernel
|
||||
// dy direvitive in y direction output
|
||||
// mag magnitude direvitive of xy output
|
||||
__kernel
|
||||
void calcMagnitude
|
||||
(
|
||||
void calcMagnitude
|
||||
(
|
||||
__global const int * dx,
|
||||
__global const int * dy,
|
||||
__global float * mag,
|
||||
@@ -219,7 +223,7 @@ __kernel
|
||||
int dy_offset,
|
||||
int mag_step,
|
||||
int mag_offset
|
||||
)
|
||||
)
|
||||
{
|
||||
dx_step /= sizeof(*dx);
|
||||
dx_offset /= sizeof(*dx);
|
||||
@@ -235,9 +239,9 @@ __kernel
|
||||
{
|
||||
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] =
|
||||
calc(
|
||||
dx[gidx + gidy * dx_step + dx_offset],
|
||||
dy[gidx + gidy * dy_step + dy_offset]
|
||||
);
|
||||
dx[gidx + gidy * dx_step + dx_offset],
|
||||
dy[gidx + gidy * dy_step + dy_offset]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -262,8 +266,10 @@ __kernel
|
||||
// mag magnitudes calculated from calcMagnitude function
|
||||
// map output containing raw edge types
|
||||
__kernel
|
||||
void calcMap
|
||||
(
|
||||
void
|
||||
__attribute__((reqd_work_group_size(16,16,1)))
|
||||
calcMap
|
||||
(
|
||||
__global const int * dx,
|
||||
__global const int * dy,
|
||||
__global const float * mag,
|
||||
@@ -280,7 +286,7 @@ __kernel
|
||||
int mag_offset,
|
||||
int map_step,
|
||||
int map_offset
|
||||
)
|
||||
)
|
||||
{
|
||||
dx_step /= sizeof(*dx);
|
||||
dx_offset /= sizeof(*dx);
|
||||
@@ -307,11 +313,13 @@ __kernel
|
||||
int ly = tid / 18;
|
||||
if(ly < 14)
|
||||
{
|
||||
smem[ly][lx] = mag[grp_idx + lx + (grp_idy + ly) * mag_step];
|
||||
smem[ly][lx] =
|
||||
mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step];
|
||||
}
|
||||
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
|
||||
{
|
||||
smem[ly + 14][lx] = mag[grp_idx + lx + (grp_idy + ly + 14) * mag_step];
|
||||
smem[ly + 14][lx] =
|
||||
mag[grp_idx + lx + min(grp_idy + ly + 14, rows -1) * mag_step];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -375,8 +383,10 @@ __kernel
|
||||
// st the potiential edge points found in this kernel call
|
||||
// counter the number of potiential edge points
|
||||
__kernel
|
||||
void edgesHysteresisLocal
|
||||
(
|
||||
void
|
||||
__attribute__((reqd_work_group_size(16,16,1)))
|
||||
edgesHysteresisLocal
|
||||
(
|
||||
__global int * map,
|
||||
__global ushort2 * st,
|
||||
volatile __global unsigned int * counter,
|
||||
@@ -384,7 +394,7 @@ __kernel
|
||||
int cols,
|
||||
int map_step,
|
||||
int map_offset
|
||||
)
|
||||
)
|
||||
{
|
||||
map_step /= sizeof(*map);
|
||||
map_offset /= sizeof(*map);
|
||||
@@ -405,11 +415,13 @@ __kernel
|
||||
int ly = tid / 18;
|
||||
if(ly < 14)
|
||||
{
|
||||
smem[ly][lx] = map[grp_idx + lx + (grp_idy + ly) * map_step + map_offset];
|
||||
smem[ly][lx] =
|
||||
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step + map_offset];
|
||||
}
|
||||
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
|
||||
{
|
||||
smem[ly + 14][lx] = map[grp_idx + lx + (grp_idy + ly + 14) * map_step + map_offset];
|
||||
smem[ly + 14][lx] =
|
||||
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step + map_offset];
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
@@ -472,8 +484,8 @@ __constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
|
||||
|
||||
#define stack_size 512
|
||||
__kernel
|
||||
void edgesHysteresisGlobal
|
||||
(
|
||||
void edgesHysteresisGlobal
|
||||
(
|
||||
__global int * map,
|
||||
__global ushort2 * st1,
|
||||
__global ushort2 * st2,
|
||||
@@ -483,7 +495,7 @@ __kernel
|
||||
int count,
|
||||
int map_step,
|
||||
int map_offset
|
||||
)
|
||||
)
|
||||
{
|
||||
|
||||
map_step /= sizeof(*map);
|
||||
@@ -535,7 +547,7 @@ __kernel
|
||||
while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
|
||||
{
|
||||
const int subTaskIdx = lidx >> 3;
|
||||
const int portion = min(s_counter, get_local_size(0)>> 3);
|
||||
const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));
|
||||
|
||||
pos.x = pos.y = 0;
|
||||
|
||||
@@ -589,8 +601,8 @@ __kernel
|
||||
// map edge type mappings
|
||||
// dst edge output
|
||||
__kernel
|
||||
void getEdges
|
||||
(
|
||||
void getEdges
|
||||
(
|
||||
__global const int * map,
|
||||
__global uchar * dst,
|
||||
int rows,
|
||||
@@ -599,19 +611,16 @@ __kernel
|
||||
int map_offset,
|
||||
int dst_step,
|
||||
int dst_offset
|
||||
)
|
||||
)
|
||||
{
|
||||
map_step /= sizeof(*map);
|
||||
map_offset /= sizeof(*map);
|
||||
//dst_step /= sizeof(*dst);
|
||||
//dst_offset /= sizeof(*dst);
|
||||
|
||||
int gidx = get_global_id(0);
|
||||
int gidy = get_global_id(1);
|
||||
|
||||
if(gidy < rows && gidx < cols)
|
||||
{
|
||||
//dst[gidx + gidy * dst_step] = map[gidx + 1 + (gidy + 1) * map_step] == 2 ? 255: 0;
|
||||
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] / 2));
|
||||
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step] >> 1));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user