add bruteForceMatcher to ocl module

This commit is contained in:
yao 2012-09-12 11:40:13 +08:00
parent 23244a3565
commit 656f06fa74
4 changed files with 2949 additions and 0 deletions

View File

@ -946,6 +946,186 @@ namespace cv
oclMat maxPosBuffer;
};
////////////////////////////////// BruteForceMatcher //////////////////////////////////
// Base class for brute-force descriptor matching on oclMat data.
// The distance metric is chosen at construction through DistType. Each kind of search
// (single best match, k nearest, radius) is exposed in three layers: a device-side call
// that fills oclMat results, static *Download / *Convert helpers that turn those results
// into DMatch vectors, and a convenience method that produces DMatch vectors directly.
class CV_EXPORTS BruteForceMatcher_OCL_base
{
public:
// Supported distance metrics. The numeric values (0, 1, 2) match the distType cases
// switched on in the OpenCL matching kernels.
enum DistType {L1Dist = 0, L2Dist, HammingDist};
explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist);
// Add descriptors to the train descriptor collection
void add(const std::vector<oclMat>& descCollection);
// Get the train descriptor collection
const std::vector<oclMat>& getTrainDescriptors() const;
// Clear the train descriptor collection
void clear();
// Return true if there are no train descriptors in the collection
bool empty() const;
// Return true if the matcher supports mask in match methods
bool isMaskSupported() const;
// Find one best match for each query descriptor
void matchSingle(const oclMat& query, const oclMat& train,
oclMat& trainIdx, oclMat& distance,
const oclMat& mask = oclMat());
// Download trainIdx and distance and convert them to a CPU vector of DMatch
static void matchDownload(const oclMat& trainIdx, const oclMat& distance, std::vector<DMatch>& matches);
// Convert trainIdx and distance to a vector of DMatch
static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);
// Find one best match for each query descriptor
void match(const oclMat& query, const oclMat& train, std::vector<DMatch>& matches, const oclMat& mask = oclMat());
// Make gpu collection of trains and masks in a format suitable for the matchCollection function
void makeGpuCollection(oclMat& trainCollection, oclMat& maskCollection, const std::vector<oclMat>& masks = std::vector<oclMat>());
// Find one best match from the train collection for each query descriptor
void matchCollection(const oclMat& query, const oclMat& trainCollection,
oclMat& trainIdx, oclMat& imgIdx, oclMat& distance,
const oclMat& masks = oclMat());
// Download trainIdx, imgIdx and distance and convert them to a vector of DMatch
static void matchDownload(const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance, std::vector<DMatch>& matches);
// Convert trainIdx, imgIdx and distance to a vector of DMatch
static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);
// Find one best match from the train collection for each query descriptor.
void match(const oclMat& query, std::vector<DMatch>& matches, const std::vector<oclMat>& masks = std::vector<oclMat>());
// Find k best matches for each query descriptor (in increasing order of distances)
void knnMatchSingle(const oclMat& query, const oclMat& train,
oclMat& trainIdx, oclMat& distance, oclMat& allDist, int k,
const oclMat& mask = oclMat());
// Download trainIdx and distance and convert them to a vector of DMatch.
// compactResult is used when mask is not empty. If compactResult is false, the matches
// vector will have the same size as queryDescriptors rows. If compactResult is true, the
// matches vector will not contain matches for fully masked-out query descriptors.
static void knnMatchDownload(const oclMat& trainIdx, const oclMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Convert trainIdx and distance to a vector of DMatch
static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find k best matches for each query descriptor (in increasing order of distances).
// compactResult is used when mask is not empty. If compactResult is false, the matches
// vector will have the same size as queryDescriptors rows. If compactResult is true, the
// matches vector will not contain matches for fully masked-out query descriptors.
void knnMatch(const oclMat& query, const oclMat& train,
std::vector< std::vector<DMatch> >& matches, int k, const oclMat& mask = oclMat(),
bool compactResult = false);
// Find k best matches from the train collection for each query descriptor (in increasing order of distances)
void knnMatch2Collection(const oclMat& query, const oclMat& trainCollection,
oclMat& trainIdx, oclMat& imgIdx, oclMat& distance,
const oclMat& maskCollection = oclMat());
// Download trainIdx, imgIdx and distance and convert them to a vector of DMatch.
// compactResult is used when mask is not empty. If compactResult is false, the matches
// vector will have the same size as queryDescriptors rows. If compactResult is true, the
// matches vector will not contain matches for fully masked-out query descriptors.
static void knnMatch2Download(const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Convert trainIdx, imgIdx and distance to a vector of DMatch
static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find k best matches for each query descriptor (in increasing order of distances).
// compactResult is used when mask is not empty. If compactResult is false, the matches
// vector will have the same size as queryDescriptors rows. If compactResult is true, the
// matches vector will not contain matches for fully masked-out query descriptors.
void knnMatch(const oclMat& query, std::vector< std::vector<DMatch> >& matches, int k,
const std::vector<oclMat>& masks = std::vector<oclMat>(), bool compactResult = false);
// Find the best matches for each query descriptor which have distance less than maxDistance.
// nMatches.at<int>(0, queryIdx) will contain the match count for queryIdx.
// Caution: nMatches can be greater than trainIdx.cols - this means the matcher did not store
// all matches, because it did not have enough memory.
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
// otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
// Matches are not sorted.
void radiusMatchSingle(const oclMat& query, const oclMat& train,
oclMat& trainIdx, oclMat& distance, oclMat& nMatches, float maxDistance,
const oclMat& mask = oclMat());
// Download trainIdx, nMatches and distance and convert them to a vector of DMatch.
// matches will be sorted in increasing order of distances.
// compactResult is used when mask is not empty. If compactResult is false, the matches
// vector will have the same size as queryDescriptors rows. If compactResult is true, the
// matches vector will not contain matches for fully masked-out query descriptors.
static void radiusMatchDownload(const oclMat& trainIdx, const oclMat& distance, const oclMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Convert trainIdx, nMatches and distance to a vector of DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find the best matches for each query descriptor which have distance less than maxDistance
// (in increasing order of distances).
void radiusMatch(const oclMat& query, const oclMat& train,
std::vector< std::vector<DMatch> >& matches, float maxDistance,
const oclMat& mask = oclMat(), bool compactResult = false);
// Find the best matches for each query descriptor which have distance less than maxDistance.
// If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
// otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches.
// Matches are not sorted.
void radiusMatchCollection(const oclMat& query, oclMat& trainIdx, oclMat& imgIdx, oclMat& distance, oclMat& nMatches, float maxDistance,
const std::vector<oclMat>& masks = std::vector<oclMat>());
// Download trainIdx, imgIdx, nMatches and distance and convert them to a vector of DMatch.
// matches will be sorted in increasing order of distances.
// compactResult is used when mask is not empty. If compactResult is false, the matches
// vector will have the same size as queryDescriptors rows. If compactResult is true, the
// matches vector will not contain matches for fully masked-out query descriptors.
static void radiusMatchDownload(const oclMat& trainIdx, const oclMat& imgIdx, const oclMat& distance, const oclMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Convert trainIdx, imgIdx, nMatches and distance to a vector of DMatch.
static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find the best matches from the train collection for each query descriptor which have distance less than
// maxDistance (in increasing order of distances).
void radiusMatch(const oclMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
const std::vector<oclMat>& masks = std::vector<oclMat>(), bool compactResult = false);
// Selected distance metric (public data member).
DistType distType;
private:
// Train descriptors; exposed read-only through getTrainDescriptors().
// NOTE(review): presumably filled by add() and emptied by clear() - confirm in the implementation file.
std::vector<oclMat> trainDescCollection;
};
// Primary template, declared but not defined here: the matcher type is selected by the
// distance functor type passed as Distance.
template <class Distance>
class CV_EXPORTS BruteForceMatcher_OCL;
// Matcher for the L1<T> distance functor: constructs the base class with L1Dist.
template <typename T>
class CV_EXPORTS BruteForceMatcher_OCL< L1<T> > : public BruteForceMatcher_OCL_base
{
public:
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {}
// The functor argument is unnamed and unused; the base is built with L1Dist regardless.
explicit BruteForceMatcher_OCL(L1<T> /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {}
};
// Matcher for the L2<T> distance functor: constructs the base class with L2Dist.
template <typename T>
class CV_EXPORTS BruteForceMatcher_OCL< L2<T> > : public BruteForceMatcher_OCL_base
{
public:
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {}
// The functor argument is unnamed and unused; the base is built with L2Dist regardless.
explicit BruteForceMatcher_OCL(L2<T> /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {}
};
// Matcher for the Hamming distance functor: constructs the base class with HammingDist.
template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base
{
public:
explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {}
// The functor argument is unnamed and unused; the base is built with HammingDist regardless.
explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {}
};
}
}
#include "opencv2/ocl/matrix_operations.hpp"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,816 @@
// Enable 32-bit atomics on global memory: the radius-match kernels in this file call
// atom_inc on a __global int counter.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
// Starting "no match yet" distance in the match / knn kernels. It is also the distance
// written out when no candidate is strictly smaller than it.
#define MAX_FLOAT 1e7f
/* Count the bits that are set to 1 in a 32-bit word (population count).

   x        the word to inspect; callers pass the XOR of two descriptor words
   returns  the number of set bits, in the range 0..32

   The argument is an unsigned integer, not a float: routing the word through a float
   drops low-order bits for values above 2^24 and is out of range for values >= 2^31,
   which gave wrong counts for large words. */
int bit1Count(unsigned int x)
{
    int c = 0;
    // Consume the word one bit at a time; the loop ends as soon as no set bit remains.
    for (; x != 0u; x >>= 1)
    {
        c += (int)(x & 0x1u);
    }
    return c;
}
/* Single best match per query row, with the work-group's query rows cached in local
   memory for the whole kernel.

   2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size,
   dim1 is block_size. local size: dim0 is block_size, dim1 is block_size.

   query, train   descriptor matrices read as float; both are indexed with the same row
                  pitch of `step` bytes
   mask           accepted but never read (the mask test below is commented out)
   bestTrainIdx   out: for each query row, index of the closest train row, or -1
   bestDistance   out: for each query row, the matching distance (MAX_FLOAT if none)
   sharebuffer    local scratch: block_size * max_desc_len floats holding the query rows,
                  followed by block_size * block_size floats holding one train tile
   max_desc_len   padded descriptor length; columns at or beyond query_cols / train_cols
                  are read as 0. NOTE(review): the loops run max_desc_len / block_size
                  times, so this presumably has to be a multiple of block_size - confirm.
   distType       0: sum of |a - b|, 1: sum of (a - b)^2, 2: sum of bit1Count(a ^ b)
*/
__kernel void BruteForceMatch_UnrollMatch(
    __global float *query,
    __global float *train,
    __global float *mask,
    __global int *bestTrainIdx,
    __global float *bestDistance,
    __local float *sharebuffer,
    int block_size,
    int max_desc_len,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * max_desc_len;

    // Row handled by this work-item; rows past the end are clamped when loading and
    // filtered out again before any result is recorded.
    int queryIdx = groupidx * block_size + lidy;

    // load the query into local memory.
    for (int i = 0 ; i < max_desc_len / block_size; i ++)
    {
        int loadx = lidx + i * block_size;
        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
    }

    float myBestDistance = MAX_FLOAT;
    int myBestTrainIdx = -1;

    // loopUnrolledCached to find the best trainIdx and best distance.
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            //load a block_size * block_size block into local train.
            // The tile is stored transposed (column-major) so that the reduction below
            // reads s_train[j * block_size + lidx] for train row lidx.
            const int loadx = lidx + i * block_size;
            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

            /* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist,
               which sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1 */
            switch(distType)
            {
            case 0:
                for (int j = 0 ; j < block_size ; j++)
                {
                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
                }
                break;
            case 1:
                for (int j = 0 ; j < block_size ; j++)
                {
                    float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
                    result += qr * qr;
                }
                break;
            case 2:
                for (int j = 0 ; j < block_size ; j++)
                {
                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                }
                break;
            }

            // Everyone must be done reading the tile before the next iteration overwrites it.
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        int trainIdx = t * block_size + lidx;

        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
        {
            //bestImgIdx = imgIdx;
            myBestDistance = result;
            myBestTrainIdx = trainIdx;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // The scratch buffer is reused for the final reduction: block_size * block_size
    // distances followed by block_size * block_size indices.
    __local float *s_distance = (__local float*)(sharebuffer);
    __local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    //find BestMatch
    s_distance += lidy * block_size;
    s_trainIdx += lidy * block_size;
    s_distance[lidx] = myBestDistance;
    s_trainIdx[lidx] = myBestTrainIdx;

    barrier(CLK_LOCAL_MEM_FENCE);

    //reduce -- now all reduce implement in each threads.
    for (int k = 0 ; k < block_size; k++)
    {
        if (myBestDistance > s_distance[k])
        {
            myBestDistance = s_distance[k];
            myBestTrainIdx = s_trainIdx[k];
        }
    }

    // Only one work-item per query row writes the result.
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
/* Single best match per query row. Query and train data are streamed through local
   memory one block_size-wide column tile at a time; the query is not cached whole.

   Work-items are addressed as (lidx, lidy) inside a block_size x block_size group;
   group id 0 selects the block of query rows.

   query, train   descriptor matrices read as float; both are indexed with the same row
                  pitch of `step` bytes
   mask           accepted but never read (the mask test below is commented out)
   bestTrainIdx   out: for each query row, index of the closest train row, or -1
   bestDistance   out: for each query row, the matching distance (MAX_FLOAT if none)
   sharebuffer    local scratch: two tiles of block_size * block_size floats
   train_cols     not read; query_cols bounds the column loads of both matrices
   distType       0: sum of |a - b|, 1: sum of (a - b)^2, 2: sum of bit1Count(a ^ b)
*/
__kernel void BruteForceMatch_Match(
__global float *query,
__global float *train,
__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
// Row handled by this work-item; rows past the end are clamped when loading and
// filtered out again before any result is recorded.
const int queryIdx = groupidx * block_size + lidy;
float myBestDistance = MAX_FLOAT;
int myBestTrainIdx = -1;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
// loop over the train rows, one block of block_size rows per iteration
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
//Dist dist;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
//load query and train into local memory
// Columns beyond query_cols stay 0. The train tile is stored transposed so the
// reduction below reads s_train[j * block_size + lidx] for train row lidx.
s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist,
which sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
// Everyone must be done reading the tiles before the next iteration overwrites them.
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
{
//myBestImgidx = imgIdx;
myBestDistance = result;
myBestTrainIdx = trainIdx;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// The scratch buffer is reused for the final reduction: the first tile holds the
// per-work-item best distances, the second the matching train indices.
__local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//findBestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
s_distance[lidx] = myBestDistance;
s_trainIdx[lidx] = myBestTrainIdx;
barrier(CLK_LOCAL_MEM_FENCE);
//reduce -- now all reduce implement in each threads.
for (int k = 0 ; k < block_size; k++)
{
if (myBestDistance > s_distance[k])
{
myBestDistance = s_distance[k];
myBestTrainIdx = s_trainIdx[k];
}
}
// Only one work-item per query row writes the result.
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
//radius_unrollmatch
/* Radius match: record every (query row, train row) pair whose distance is strictly
   below maxDistance. The column loop runs max_desc_len / block_size times.

   Work-items are addressed as (lidx, lidy) inside a block_size x block_size group;
   group id 0 selects the block of train rows and group id 1 the block of query rows,
   so each work-item evaluates exactly one pair.

   query, train       descriptor matrices read as float; both are indexed with the same
                      row pitch of `step` bytes
   mask               accepted but never read (the mask test below is commented out)
   bestTrainIdx       out: per query row, the train indices of the stored matches
   bestDistance       out: per query row, the distances of the stored matches
   nMatches           in/out: per query row counter, incremented atomically for every
                      match found; it can exceed bestTrainIdx_cols, in which case the
                      extra matches are counted but not stored.
                      NOTE(review): presumably zeroed by the host before launch - confirm.
   sharebuffer        local scratch: two tiles of block_size * block_size floats
   train_cols         not read; query_cols bounds the column loads of both matrices
   bestTrainIdx_cols  capacity (in matches) of one output row
   ostep              row pitch in bytes of bestTrainIdx and bestDistance
   distType           0: sum of |a - b|, 1: sum of (a - b)^2, 2: sum of bit1Count(a ^ b)
*/
__kernel void BruteForceMatch_RadiusUnrollMatch(
__global float *query,
__global float *train,
float maxDistance,
__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int bestTrainIdx_cols,
int step,
int ostep,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
{
//load a block_size * block_size block into local train.
// The train tile is stored transposed so the reduction below reads
// s_train[j * block_size + lidx] for train row lidx.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist,
which sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
}
break;
}
// Everyone must be done reading the tiles before the next iteration overwrites them.
barrier(CLK_LOCAL_MEM_FENCE);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{
// Reserve the next output slot for this query row; atom_inc returns the old count.
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if(ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
}
//radius_match
/* Radius match: record every (query row, train row) pair whose distance is strictly
   below maxDistance. The column loop covers query_cols in block_size-wide tiles
   (rounded up).

   Work-items are addressed as (lidx, lidy) inside a block_size x block_size group;
   group id 0 selects the block of train rows and group id 1 the block of query rows,
   so each work-item evaluates exactly one pair.

   query, train       descriptor matrices read as float; both are indexed with the same
                      row pitch of `step` bytes
   mask               accepted but never read (the mask test below is commented out)
   bestTrainIdx       out: per query row, the train indices of the stored matches
   bestDistance       out: per query row, the distances of the stored matches
   nMatches           in/out: per query row counter, incremented atomically for every
                      match found; it can exceed bestTrainIdx_cols, in which case the
                      extra matches are counted but not stored.
                      NOTE(review): presumably zeroed by the host before launch - confirm.
   sharebuffer        local scratch: two tiles of block_size * block_size floats
   train_cols         not read; query_cols bounds the column loads of both matrices
   bestTrainIdx_cols  capacity (in matches) of one output row
   ostep              row pitch in bytes of bestTrainIdx and bestDistance
   distType           0: sum of |a - b|, 1: sum of (a - b)^2, 2: sum of bit1Count(a ^ b)
*/
__kernel void BruteForceMatch_RadiusMatch(
__global float *query,
__global float *train,
float maxDistance,
__global float *mask,
__global int *bestTrainIdx,
__global float *bestDistance,
__global int *nMatches,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int bestTrainIdx_cols,
int step,
int ostep,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
const int groupidy = get_group_id(1);
const int queryIdx = groupidy * block_size + lidy;
const int trainIdx = groupidx * block_size + lidx;
__local float *s_query = sharebuffer;
__local float *s_train = sharebuffer + block_size * block_size;
float result = 0;
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
{
//load a block_size * block_size block into local train.
// The train tile is stored transposed so the reduction below reads
// s_train[j * block_size + lidx] for train row lidx.
const int loadx = lidx + i * block_size;
s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
//synchronize to make sure each elem for reduceIteration in share memory is written already.
barrier(CLK_LOCAL_MEM_FENCE);
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist,
which sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
{
case 0:
for (int j = 0 ; j < block_size ; ++j)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; ++j)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; ++j)
{
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
}
break;
}
// Everyone must be done reading the tiles before the next iteration overwrites them.
barrier(CLK_LOCAL_MEM_FENCE);
}
if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
{
// Reserve the next output slot for this query row; atom_inc returns the old count.
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
if(ind < bestTrainIdx_cols)
{
//bestImgIdx = imgIdx;
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
}
}
}
/* Two best matches per query row (k = 2), with the work-group's query rows cached in
   local memory for the whole kernel.

   Work-items are addressed as (lidx, lidy) inside a block_size x block_size group;
   group id 0 selects the block of query rows.

   query, train   descriptor matrices read as float; both are indexed with the same row
                  pitch of `step` bytes
   mask           accepted but never read
   bestTrainIdx   out: for each query row, (closest, second closest) train row indices;
                  a component is -1 when no candidate was found for it
   bestDistance   out: for each query row, the two matching distances (MAX_FLOAT where
                  no candidate was found)
   sharebuffer    local scratch: block_size * max_desc_len floats holding the query rows,
                  followed by block_size * block_size floats holding one train tile
   max_desc_len   padded descriptor length; columns at or beyond query_cols / train_cols
                  are read as 0. NOTE(review): the loops run max_desc_len / block_size
                  times, so this presumably has to be a multiple of block_size - confirm.
   distType       0: sum of |a - b|, 1: sum of (a - b)^2, 2: sum of bit1Count(a ^ b)
*/
__kernel void BruteForceMatch_knnUnrollMatch(
    __global float *query,
    __global float *train,
    __global float *mask,
    __global int2 *bestTrainIdx,
    __global float2 *bestDistance,
    __local float *sharebuffer,
    int block_size,
    int max_desc_len,
    int query_rows,
    int query_cols,
    int train_rows,
    int train_cols,
    int step,
    int distType
)
{
    const int lidx = get_local_id(0);
    const int lidy = get_local_id(1);
    const int groupidx = get_group_id(0);

    // Row handled by this work-item; rows past the end are clamped when loading and
    // filtered out again before any result is recorded.
    const int queryIdx = groupidx * block_size + lidy;

    __local float *s_query = sharebuffer;
    __local float *s_train = sharebuffer + block_size * max_desc_len;

    // load the query into local memory.
    for (int i = 0 ; i < max_desc_len / block_size; i ++)
    {
        int loadx = lidx + i * block_size;
        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
    }

    float myBestDistance1 = MAX_FLOAT;
    float myBestDistance2 = MAX_FLOAT;
    int myBestTrainIdx1 = -1;
    int myBestTrainIdx2 = -1;

    //loopUnrolledCached
    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
    {
        float result = 0;
        for (int i = 0 ; i < max_desc_len / block_size ; i++)
        {
            //load a block_size * block_size block into local train.
            // The tile is stored transposed so the reduction below reads
            // s_train[j * block_size + lidx] for train row lidx.
            const int loadx = lidx + i * block_size;
            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;

            //synchronize to make sure each elem for reduceIteration in share memory is written already.
            barrier(CLK_LOCAL_MEM_FENCE);

            /* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist,
               which sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1 */
            switch(distType)
            {
            case 0:
                for (int j = 0 ; j < block_size ; j++)
                {
                    result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
                }
                break;
            case 1:
                for (int j = 0 ; j < block_size ; j++)
                {
                    float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
                    result += qr * qr;
                }
                break;
            case 2:
                for (int j = 0 ; j < block_size ; j++)
                {
                    //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                    result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
                }
                break;
            }

            // Everyone must be done reading the tile before the next iteration overwrites it.
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        const int trainIdx = t * block_size + lidx;

        if (queryIdx < query_rows && trainIdx < train_rows)
        {
            if (result < myBestDistance1)
            {
                // New best: the previous best becomes the runner-up.
                myBestDistance2 = myBestDistance1;
                myBestTrainIdx2 = myBestTrainIdx1;
                myBestDistance1 = result;
                myBestTrainIdx1 = trainIdx;
            }
            else if (result < myBestDistance2)
            {
                myBestDistance2 = result;
                myBestTrainIdx2 = trainIdx;
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // The scratch buffer is reused for the final reduction: block_size * block_size
    // distances followed by block_size * block_size indices.
    __local float *s_distance = (__local float *)sharebuffer;
    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);

    // find BestMatch
    s_distance += lidy * block_size;
    s_trainIdx += lidy * block_size;

    // Pass 1: publish each work-item's best candidate.
    s_distance[lidx] = myBestDistance1;
    s_trainIdx[lidx] = myBestTrainIdx1;

    float bestDistance1 = MAX_FLOAT;
    float bestDistance2 = MAX_FLOAT;
    int bestTrainIdx1 = -1;
    int bestTrainIdx2 = -1;

    barrier(CLK_LOCAL_MEM_FENCE);

    if (lidx == 0)
    {
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance1)
            {
                bestDistance2 = bestDistance1;
                bestTrainIdx2 = bestTrainIdx1;
                bestDistance1 = val;
                bestTrainIdx1 = s_trainIdx[i];
            }
            else if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // Pass 2: publish each work-item's runner-up; any of them may still beat the
    // second-best value found in pass 1.
    s_distance[lidx] = myBestDistance2;
    s_trainIdx[lidx] = myBestTrainIdx2;

    barrier(CLK_LOCAL_MEM_FENCE);

    if (lidx == 0)
    {
        for (int i = 0 ; i < block_size ; i++)
        {
            float val = s_distance[i];
            if (val < bestDistance2)
            {
                bestDistance2 = val;
                bestTrainIdx2 = s_trainIdx[i];
            }
        }
    }

    myBestDistance1 = bestDistance1;
    myBestDistance2 = bestDistance2;
    myBestTrainIdx1 = bestTrainIdx1;
    myBestTrainIdx2 = bestTrainIdx2;

    // Only one work-item per query row writes the result.
    if (queryIdx < query_rows && lidx == 0)
    {
        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
    }
}
/* Two best matches per query row (k = 2). Query and train data are streamed through
   local memory one block_size-wide column tile at a time; the query is not cached whole.

   Work-items are addressed as (lidx, lidy) inside a block_size x block_size group;
   group id 0 selects the block of query rows.

   query, train   descriptor matrices read as float; both are indexed with the same row
                  pitch of `step` bytes
   mask           accepted but never read (the mask test below is commented out)
   bestTrainIdx   out: for each query row, (closest, second closest) train row indices;
                  a component is -1 when no candidate was found for it
   bestDistance   out: for each query row, the two matching distances (MAX_FLOAT where
                  no candidate was found)
   sharebuffer    local scratch: two tiles of block_size * block_size floats
   train_cols     not read; query_cols bounds the column loads of both matrices
   distType       0: sum of |a - b|, 1: sum of (a - b)^2, 2: sum of bit1Count(a ^ b)
*/
__kernel void BruteForceMatch_knnMatch(
__global float *query,
__global float *train,
__global float *mask,
__global int2 *bestTrainIdx,
__global float2 *bestDistance,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType
)
{
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
const int groupidx = get_group_id(0);
// Row handled by this work-item; rows past the end are clamped when loading and
// filtered out again before any result is recorded.
const int queryIdx = groupidx * block_size + lidy;
local float *s_query = sharebuffer;
local float *s_train = sharebuffer + block_size * block_size;
float myBestDistance1 = MAX_FLOAT;
float myBestDistance2 = MAX_FLOAT;
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
//loop over the train rows, one block of block_size rows per iteration
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
{
float result = 0.0f;
for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
{
const int loadx = lidx + i * block_size;
//load query and train into local memory
// Columns beyond query_cols stay 0. The train tile is stored transposed so the
// reduction below reads s_train[j * block_size + lidx] for train row lidx.
s_query[lidy * block_size + lidx] = 0;
s_train[lidx * block_size + lidy] = 0;
if (loadx < query_cols)
{
s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
}
barrier(CLK_LOCAL_MEM_FENCE);
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist,
which sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits set to 1 */
switch(distType)
{
case 0:
for (int j = 0 ; j < block_size ; j++)
{
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
}
break;
case 1:
for (int j = 0 ; j < block_size ; j++)
{
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
result += qr * qr;
}
break;
case 2:
for (int j = 0 ; j < block_size ; j++)
{
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
}
break;
}
// Everyone must be done reading the tiles before the next iteration overwrites them.
barrier(CLK_LOCAL_MEM_FENCE);
}
const int trainIdx = t * block_size + lidx;
if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
{
if (result < myBestDistance1)
{
// New best: the previous best becomes the runner-up.
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1;
myBestDistance1 = result;
myBestTrainIdx1 = trainIdx;
}
else if (result < myBestDistance2)
{
myBestDistance2 = result;
myBestTrainIdx2 = trainIdx;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// The scratch buffer is reused for the final reduction: the first tile holds
// distances, the second the matching train indices.
__local float *s_distance = (__local float *)sharebuffer;
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
//findBestMatch
s_distance += lidy * block_size;
s_trainIdx += lidy * block_size;
// Pass 1: publish each work-item's best candidate.
s_distance[lidx] = myBestDistance1;
s_trainIdx[lidx] = myBestTrainIdx1;
float bestDistance1 = MAX_FLOAT;
float bestDistance2 = MAX_FLOAT;
int bestTrainIdx1 = -1;
int bestTrainIdx2 = -1;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance1)
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = val;
bestTrainIdx1 = s_trainIdx[i];
}
else if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// Pass 2: publish each work-item's runner-up; any of them may still beat the
// second-best value found in pass 1.
s_distance[lidx] = myBestDistance2;
s_trainIdx[lidx] = myBestTrainIdx2;
barrier(CLK_LOCAL_MEM_FENCE);
if (lidx == 0)
{
for (int i = 0 ; i < block_size ; i++)
{
float val = s_distance[i];
if (val < bestDistance2)
{
bestDistance2 = val;
bestTrainIdx2 = s_trainIdx[i];
}
}
}
myBestDistance1 = bestDistance1;
myBestDistance2 = bestDistance2;
myBestTrainIdx1 = bestTrainIdx1;
myBestTrainIdx2 = bestTrainIdx2;
// Only one work-item per query row writes the result.
if (queryIdx < query_rows && lidx == 0)
{
bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
}
}
/* Placeholder kernel: the body is empty, so launching it reads and writes nothing.
   NOTE(review): the name and the allDist parameter suggest it is meant to fill a
   query-by-train distance table for the general k-nearest search - confirm before
   implementing. */
kernel void BruteForceMatch_calcDistanceUnrolled(
__global float *query,
__global float *train,
__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int max_desc_len,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType)
{
/* Todo */
}
/* Placeholder kernel: the body is empty, so launching it reads and writes nothing.
   NOTE(review): the name and the allDist parameter suggest it is meant to fill a
   query-by-train distance table for the general k-nearest search - confirm before
   implementing. */
kernel void BruteForceMatch_calcDistance(
__global float *query,
__global float *train,
__global float *mask,
__global float *allDist,
__local float *sharebuffer,
int block_size,
int query_rows,
int query_cols,
int train_rows,
int train_cols,
int step,
int distType)
{
/* Todo */
}
/* Placeholder kernel: the body is empty, so launching it reads and writes nothing.
   NOTE(review): the parameters suggest it is meant to pick the k best entries per
   query row out of allDist - confirm before implementing. */
kernel void BruteForceMatch_findBestMatch(
__global float *allDist,
__global int *bestTrainIdx,
__global float *bestDistance,
int k,
int block_size
)
{
/* Todo */
}

View File

@ -0,0 +1,219 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
namespace {
/////////////////////////////////////////////////////////////////////////////////////////////////
// BruteForceMatcher
// Test-parameter wrapper listing the distance types of the OCL brute-force matcher.
CV_ENUM(DistType,
        cv::ocl::BruteForceMatcher_OCL_base::L1Dist,
        cv::ocl::BruteForceMatcher_OCL_base::L2Dist,
        cv::ocl::BruteForceMatcher_OCL_base::HammingDist)

// Test-parameter wrapper around an int; used below as the descriptor length.
IMPLEMENT_PARAM_CLASS(DescriptorSize, int)
// Fixture shared by the brute-force matcher tests.
// Parameters: (DistType, DescriptorSize). A NormCode parameter is currently disabled.
PARAM_TEST_CASE(BruteForceMatcher/*, NormCode*/, DistType, DescriptorSize)
{
    //std::vector<cv::ocl::Info> oclinfo;

    cv::ocl::BruteForceMatcher_OCL_base::DistType distType; // distance type under test
    int normCode;        // not assigned by SetUp (see the disabled NormCode parameter)
    int dim;             // descriptor length (number of columns)
    int queryDescCount;  // number of query descriptors
    int countFactor;     // number of train descriptors generated per query descriptor

    cv::Mat query, train; // host-side descriptor matrices, CV_32F

    // Builds the random query set and the derived train set.
    virtual void SetUp()
    {
        //normCode = GET_PARAM(0);
        const int distTypeValue = static_cast<int>(GET_PARAM(0));
        distType = static_cast<cv::ocl::BruteForceMatcher_OCL_base::DistType>(distTypeValue);
        dim = GET_PARAM(1);

        //int devnums = getDevice(oclinfo, OPENCV_DEFAULT_OPENCL_DEVICE);
        //CV_Assert(devnums > 0);

        queryDescCount = 300; // must be even number because we split train data in some cases in two
        countFactor = 4;      // do not change it

        cv::RNG& rng = cvtest::TS::ptr()->get_rng();

        cv::Mat queryBuf;
        cv::Mat trainBuf;

        // Generate query descriptors randomly.
        // Descriptor vector elements are integer values.
        queryBuf.create(queryDescCount, dim, CV_32SC1);
        rng.fill(queryBuf, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(3));
        queryBuf.convertTo(queryBuf, CV_32FC1);

        // Generate train descriptors as follows:
        // copy each query descriptor to the train set countFactor times
        // and perturb one randomly chosen element of each copy, with the
        // perturbation growing in ascending order from copy to copy.
        // The general boundaries of the perturbation are (0.f, 1.f).
        const int trainDescCount = queryDescCount * countFactor;
        trainBuf.create(trainDescCount, dim, CV_32FC1);

        const float bandWidth = 1.f / countFactor;
        for (int tIdx = 0; tIdx < trainDescCount; ++tIdx)
        {
            // Train rows are laid out query by query: row tIdx is copy number
            // `band` of query row `qIdx`.
            const int qIdx = tIdx / countFactor;
            const int band = tIdx % countFactor;

            cv::Mat trainRow = trainBuf.row(tIdx);
            queryBuf.row(qIdx).copyTo(trainRow);

            // Draw the element index first, then the perturbation, so the
            // sequence of RNG calls stays the same.
            const int elem = rng(dim);
            const float diff = rng.uniform(bandWidth * band, bandWidth * (band + 1));
            trainRow.at<float>(0, elem) += diff;
        }

        queryBuf.convertTo(query, CV_32F);
        trainBuf.convertTo(train, CV_32F);
    }
};
// match(): every query row i is expected to be matched to train row
// i * countFactor with imgIdx 0, and exactly one match per query is expected.
TEST_P(BruteForceMatcher, Match_Single)
{
    cv::ocl::BruteForceMatcher_OCL_base matcher(distType);

    std::vector<cv::DMatch> matches;
    matcher.match(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int mismatches = 0;
    for (size_t q = 0; q < matches.size(); ++q)
    {
        const cv::DMatch& found = matches[q];
        const int queryRow = static_cast<int>(q);

        const bool asExpected = found.queryIdx == queryRow
                             && found.trainIdx == queryRow * countFactor
                             && found.imgIdx == 0;
        if (!asExpected)
            ++mismatches;
    }

    ASSERT_EQ(0, mismatches);
}
// knnMatch() with k = 2: for query row i the k-th returned neighbour is
// expected to be train row i * countFactor + k with imgIdx 0.
// A query counts as failed once, no matter how many of its neighbours are wrong.
TEST_P(BruteForceMatcher, KnnMatch_2_Single)
{
    const int knn = 2;

    cv::ocl::BruteForceMatcher_OCL_base matcher(distType);

    std::vector< std::vector<cv::DMatch> > matches;
    matcher.knnMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, knn);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int failedQueries = 0;
    for (size_t q = 0; q < matches.size(); ++q)
    {
        const std::vector<cv::DMatch>& neighbours = matches[q];
        const int queryRow = static_cast<int>(q);

        // A wrong neighbour count fails the query outright; otherwise every
        // neighbour has to match its expected train row.
        bool queryOk = static_cast<int>(neighbours.size()) == knn;
        for (int rank = 0; queryOk && rank < knn; ++rank)
        {
            const cv::DMatch& found = neighbours[rank];
            queryOk = found.queryIdx == queryRow
                   && found.trainIdx == queryRow * countFactor + rank
                   && found.imgIdx == 0;
        }

        if (!queryOk)
            ++failedQueries;
    }

    ASSERT_EQ(0, failedQueries);
}
// radiusMatch(): with the radius chosen below, every query row i is expected
// to get exactly one match, namely train row i * countFactor with imgIdx 0.
TEST_P(BruteForceMatcher, RadiusMatch_Single)
{
    // L2Dist uses a radius of 1 / countFactor^2, every other distance type
    // uses 1 / countFactor.
    // NOTE(review): the smaller L2 radius presumably reflects how the matcher
    // scales L2 distances -- confirm against the implementation.
    const float radius = (distType == cv::ocl::BruteForceMatcher_OCL_base::L2Dist)
                       ? 1.f / countFactor / countFactor
                       : 1.f / countFactor;

    cv::ocl::BruteForceMatcher_OCL_base matcher(distType);

    // assume support atomic.
    // A disabled variant of this test first checked
    //     supportFeature(devInfo, cv::gpu::GLOBAL_ATOMICS)
    // and, when the feature was missing, only required radiusMatch() to throw
    // a cv::Exception with code CV_StsNotImplemented. That branch is not
    // compiled here; the matching path below always runs.

    std::vector< std::vector<cv::DMatch> > matches;
    matcher.radiusMatch(cv::ocl::oclMat(query), cv::ocl::oclMat(train), matches, radius);

    ASSERT_EQ(static_cast<size_t>(queryDescCount), matches.size());

    int failedQueries = 0;
    for (size_t q = 0; q < matches.size(); ++q)
    {
        const std::vector<cv::DMatch>& inRadius = matches[q];
        const int queryRow = static_cast<int>(q);

        bool queryOk = static_cast<int>(inRadius.size()) == 1;
        if (queryOk)
        {
            const cv::DMatch& found = inRadius[0];
            queryOk = found.queryIdx == queryRow
                   && found.trainIdx == queryRow * countFactor
                   && found.imgIdx == 0;
        }

        if (!queryOk)
            ++failedQueries;
    }

    ASSERT_EQ(0, failedQueries);
}
// Run every BruteForceMatcher test for each (distance type, descriptor length)
// combination. Only L1Dist and L2Dist are instantiated here; HammingDist is
// not part of the list.
INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
    //ALL_DEVICES,
    testing::Values(
        DistType(cv::ocl::BruteForceMatcher_OCL_base::L1Dist),
        DistType(cv::ocl::BruteForceMatcher_OCL_base::L2Dist)),
    testing::Values(
        DescriptorSize(57),
        DescriptorSize(64),
        DescriptorSize(83),
        DescriptorSize(128),
        DescriptorSize(179),
        DescriptorSize(256),
        DescriptorSize(304))));
} // namespace