Fix ocl::bruteforcematcher crash on Intel OCL
This commit is contained in:
parent
620c699456
commit
504008dbe0
modules/ocl/src
@ -51,7 +51,6 @@ using namespace cv;
|
|||||||
using namespace cv::ocl;
|
using namespace cv::ocl;
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
namespace cv
|
namespace cv
|
||||||
{
|
{
|
||||||
namespace ocl
|
namespace ocl
|
||||||
@ -62,7 +61,7 @@ namespace cv
|
|||||||
}
|
}
|
||||||
|
|
||||||
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
||||||
void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
|
void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||||
const oclMat &trainIdx, const oclMat &distance, int distType)
|
const oclMat &trainIdx, const oclMat &distance, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
@ -77,7 +76,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
|
|||||||
{
|
{
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||||
@ -103,7 +102,7 @@ void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int
|
|||||||
}
|
}
|
||||||
|
|
||||||
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
||||||
void match(const oclMat &query, const oclMat &train, const oclMat &mask,
|
void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||||
const oclMat &trainIdx, const oclMat &distance, int distType)
|
const oclMat &trainIdx, const oclMat &distance, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
@ -117,7 +116,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
|
|||||||
{
|
{
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||||
@ -143,7 +142,7 @@ void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const o
|
|||||||
|
|
||||||
//radius_matchUnrolledCached
|
//radius_matchUnrolledCached
|
||||||
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
||||||
void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
|
void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
|
||||||
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
|
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
@ -159,7 +158,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
|||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
|
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
||||||
@ -183,7 +182,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
|||||||
|
|
||||||
//radius_match
|
//radius_match
|
||||||
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
||||||
void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &mask,
|
void radius_match(const oclMat &query, const oclMat &train, float maxDistance, const oclMat &/*mask*/,
|
||||||
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
|
const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
@ -198,7 +197,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
|
|||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
|
args.push_back( make_pair( sizeof(cl_float), (void *)&maxDistance ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
||||||
@ -472,7 +471,7 @@ void matchDispatcher(const oclMat &query, const oclMat &train, int n, float maxD
|
|||||||
|
|
||||||
//knn match Dispatcher
|
//knn match Dispatcher
|
||||||
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
||||||
void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &mask,
|
void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||||
const oclMat &trainIdx, const oclMat &distance, int distType)
|
const oclMat &trainIdx, const oclMat &distance, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
@ -487,7 +486,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
|
|||||||
{
|
{
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||||
@ -507,7 +506,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
|
|||||||
}
|
}
|
||||||
|
|
||||||
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
||||||
void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
|
void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||||
const oclMat &trainIdx, const oclMat &distance, int distType)
|
const oclMat &trainIdx, const oclMat &distance, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
@ -521,7 +520,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
|
|||||||
{
|
{
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||||
@ -540,7 +539,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &mask,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
|
||||||
void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
|
void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
|
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||||
@ -554,7 +553,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
|
|||||||
{
|
{
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
|
||||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||||
@ -573,7 +572,7 @@ void calcDistanceUnrolled(const oclMat &query, const oclMat &train, const oclMat
|
|||||||
}
|
}
|
||||||
|
|
||||||
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
|
||||||
void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask, const oclMat &allDist, int distType)
|
void calcDistance(const oclMat &query, const oclMat &train, const oclMat &/*mask*/, const oclMat &allDist, int distType)
|
||||||
{
|
{
|
||||||
cv::ocl::Context *ctx = query.clCxt;
|
cv::ocl::Context *ctx = query.clCxt;
|
||||||
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
|
size_t globalSize[] = {(query.rows + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE, BLOCK_SIZE, 1};
|
||||||
@ -586,7 +585,7 @@ void calcDistance(const oclMat &query, const oclMat &train, const oclMat &mask,
|
|||||||
{
|
{
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&train.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
//args.push_back( make_pair( sizeof(cl_mem), (void *)&mask.data ));
|
||||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
|
args.push_back( make_pair( sizeof(cl_mem), (void *)&allDist.data ));
|
||||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||||
@ -691,7 +690,7 @@ void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const o
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
|
void findKnnMatchDispatcher(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType)
|
||||||
{
|
{
|
||||||
findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
|
findKnnMatch<256>(k, trainIdx, distance, allDist, distType);
|
||||||
}
|
}
|
||||||
@ -1007,6 +1006,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::matchConvert(const Mat &trainIdx, cons
|
|||||||
|
|
||||||
void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector<DMatch> &matches, const oclMat &mask)
|
void cv::ocl::BruteForceMatcher_OCL_base::match(const oclMat &query, const oclMat &train, vector<DMatch> &matches, const oclMat &mask)
|
||||||
{
|
{
|
||||||
|
assert(mask.empty()); // mask is not supported at the moment
|
||||||
oclMat trainIdx, distance;
|
oclMat trainIdx, distance;
|
||||||
matchSingle(query, train, trainIdx, distance, mask);
|
matchSingle(query, train, trainIdx, distance, mask);
|
||||||
matchDownload(trainIdx, distance, matches);
|
matchDownload(trainIdx, distance, matches);
|
||||||
@ -1697,3 +1697,5 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, vecto
|
|||||||
radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
|
radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
|
||||||
radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
|
radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -5,11 +5,13 @@ int bit1Count(float x)
|
|||||||
{
|
{
|
||||||
int c = 0;
|
int c = 0;
|
||||||
int ix = (int)x;
|
int ix = (int)x;
|
||||||
|
|
||||||
for (int i = 0 ; i < 32 ; i++)
|
for (int i = 0 ; i < 32 ; i++)
|
||||||
{
|
{
|
||||||
c += ix & 0x1;
|
c += ix & 0x1;
|
||||||
ix >>= 1;
|
ix >>= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (float)c;
|
return (float)c;
|
||||||
}
|
}
|
||||||
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
|
/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
|
||||||
@ -18,7 +20,7 @@ local size: dim0 is block_size, dim1 is block_size.
|
|||||||
__kernel void BruteForceMatch_UnrollMatch(
|
__kernel void BruteForceMatch_UnrollMatch(
|
||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global int *bestTrainIdx,
|
__global int *bestTrainIdx,
|
||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
@ -30,7 +32,7 @@ __kernel void BruteForceMatch_UnrollMatch(
|
|||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step,
|
||||||
int distType
|
int distType
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
@ -40,6 +42,7 @@ __kernel void BruteForceMatch_UnrollMatch(
|
|||||||
__local float *s_train = sharebuffer + block_size * max_desc_len;
|
__local float *s_train = sharebuffer + block_size * max_desc_len;
|
||||||
|
|
||||||
int queryIdx = groupidx * block_size + lidy;
|
int queryIdx = groupidx * block_size + lidy;
|
||||||
|
|
||||||
// load the query into local memory.
|
// load the query into local memory.
|
||||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||||
{
|
{
|
||||||
@ -52,9 +55,11 @@ __kernel void BruteForceMatch_UnrollMatch(
|
|||||||
|
|
||||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||||
volatile int imgIdx = 0;
|
volatile int imgIdx = 0;
|
||||||
|
|
||||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||||
{
|
{
|
||||||
float result = 0;
|
float result = 0;
|
||||||
|
|
||||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||||
{
|
{
|
||||||
//load a block_size * block_size block into local train.
|
//load a block_size * block_size block into local train.
|
||||||
@ -67,27 +72,33 @@ __kernel void BruteForceMatch_UnrollMatch(
|
|||||||
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
||||||
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
||||||
|
|
||||||
switch(distType)
|
switch (distType)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||||
result += qr * qr;
|
result += qr * qr;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,8 +116,8 @@ __kernel void BruteForceMatch_UnrollMatch(
|
|||||||
}
|
}
|
||||||
|
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
__local float *s_distance = (__local float*)(sharebuffer);
|
__local float *s_distance = (__local float *)(sharebuffer);
|
||||||
__local int* s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
__local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
|
||||||
|
|
||||||
//find BestMatch
|
//find BestMatch
|
||||||
s_distance += lidy * block_size;
|
s_distance += lidy * block_size;
|
||||||
@ -136,7 +147,7 @@ __kernel void BruteForceMatch_UnrollMatch(
|
|||||||
__kernel void BruteForceMatch_Match(
|
__kernel void BruteForceMatch_Match(
|
||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global int *bestTrainIdx,
|
__global int *bestTrainIdx,
|
||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
@ -147,7 +158,7 @@ __kernel void BruteForceMatch_Match(
|
|||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step,
|
||||||
int distType
|
int distType
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
@ -166,6 +177,7 @@ __kernel void BruteForceMatch_Match(
|
|||||||
{
|
{
|
||||||
//Dist dist;
|
//Dist dist;
|
||||||
float result = 0;
|
float result = 0;
|
||||||
|
|
||||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||||
{
|
{
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * block_size;
|
||||||
@ -184,27 +196,33 @@ __kernel void BruteForceMatch_Match(
|
|||||||
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
||||||
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
||||||
|
|
||||||
switch(distType)
|
switch (distType)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||||
result += qr * qr;
|
result += qr * qr;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
|
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -256,7 +274,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
|
|||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
float maxDistance,
|
float maxDistance,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global int *bestTrainIdx,
|
__global int *bestTrainIdx,
|
||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__global int *nMatches,
|
__global int *nMatches,
|
||||||
@ -271,7 +289,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
|
|||||||
int step,
|
int step,
|
||||||
int ostep,
|
int ostep,
|
||||||
int distType
|
int distType
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
@ -285,6 +303,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
|
|||||||
__local float *s_train = sharebuffer + block_size * block_size;
|
__local float *s_train = sharebuffer + block_size * block_size;
|
||||||
|
|
||||||
float result = 0;
|
float result = 0;
|
||||||
|
|
||||||
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
|
for (int i = 0 ; i < max_desc_len / block_size ; ++i)
|
||||||
{
|
{
|
||||||
//load a block_size * block_size block into local train.
|
//load a block_size * block_size block into local train.
|
||||||
@ -299,26 +318,32 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
|
|||||||
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
||||||
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
||||||
|
|
||||||
switch(distType)
|
switch (distType)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; ++j)
|
for (int j = 0 ; j < block_size ; ++j)
|
||||||
{
|
{
|
||||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; ++j)
|
for (int j = 0 ; j < block_size ; ++j)
|
||||||
{
|
{
|
||||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||||
result += qr * qr;
|
result += qr * qr;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; ++j)
|
for (int j = 0 ; j < block_size ; ++j)
|
||||||
{
|
{
|
||||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -329,7 +354,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
|
|||||||
{
|
{
|
||||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||||
|
|
||||||
if(ind < bestTrainIdx_cols)
|
if (ind < bestTrainIdx_cols)
|
||||||
{
|
{
|
||||||
//bestImgIdx = imgIdx;
|
//bestImgIdx = imgIdx;
|
||||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||||
@ -343,7 +368,7 @@ __kernel void BruteForceMatch_RadiusMatch(
|
|||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
float maxDistance,
|
float maxDistance,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global int *bestTrainIdx,
|
__global int *bestTrainIdx,
|
||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
__global int *nMatches,
|
__global int *nMatches,
|
||||||
@ -357,7 +382,7 @@ __kernel void BruteForceMatch_RadiusMatch(
|
|||||||
int step,
|
int step,
|
||||||
int ostep,
|
int ostep,
|
||||||
int distType
|
int distType
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
@ -371,6 +396,7 @@ __kernel void BruteForceMatch_RadiusMatch(
|
|||||||
__local float *s_train = sharebuffer + block_size * block_size;
|
__local float *s_train = sharebuffer + block_size * block_size;
|
||||||
|
|
||||||
float result = 0;
|
float result = 0;
|
||||||
|
|
||||||
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
|
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
|
||||||
{
|
{
|
||||||
//load a block_size * block_size block into local train.
|
//load a block_size * block_size block into local train.
|
||||||
@ -385,26 +411,32 @@ __kernel void BruteForceMatch_RadiusMatch(
|
|||||||
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
/* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2); the second is L2Dist, which
|
||||||
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
sums (v1 - v2) * (v1 - v2); the third is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1 */
|
||||||
|
|
||||||
switch(distType)
|
switch (distType)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; ++j)
|
for (int j = 0 ; j < block_size ; ++j)
|
||||||
{
|
{
|
||||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; ++j)
|
for (int j = 0 ; j < block_size ; ++j)
|
||||||
{
|
{
|
||||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||||
result += qr * qr;
|
result += qr * qr;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; ++j)
|
for (int j = 0 ; j < block_size ; ++j)
|
||||||
{
|
{
|
||||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -415,7 +447,7 @@ __kernel void BruteForceMatch_RadiusMatch(
|
|||||||
{
|
{
|
||||||
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
|
||||||
|
|
||||||
if(ind < bestTrainIdx_cols)
|
if (ind < bestTrainIdx_cols)
|
||||||
{
|
{
|
||||||
//bestImgIdx = imgIdx;
|
//bestImgIdx = imgIdx;
|
||||||
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
|
||||||
@ -428,7 +460,7 @@ __kernel void BruteForceMatch_RadiusMatch(
|
|||||||
__kernel void BruteForceMatch_knnUnrollMatch(
|
__kernel void BruteForceMatch_knnUnrollMatch(
|
||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global int2 *bestTrainIdx,
|
__global int2 *bestTrainIdx,
|
||||||
__global float2 *bestDistance,
|
__global float2 *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
@ -440,7 +472,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
|||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step,
|
||||||
int distType
|
int distType
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
@ -464,9 +496,11 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
|||||||
|
|
||||||
//loopUnrolledCached
|
//loopUnrolledCached
|
||||||
volatile int imgIdx = 0;
|
volatile int imgIdx = 0;
|
||||||
|
|
||||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||||
{
|
{
|
||||||
float result = 0;
|
float result = 0;
|
||||||
|
|
||||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||||
{
|
{
|
||||||
const int loadX = lidx + i * block_size;
|
const int loadX = lidx + i * block_size;
|
||||||
@ -480,27 +514,33 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
|||||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||||
|
|
||||||
switch(distType)
|
switch (distType)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
|
||||||
result += qr * qr;
|
result += qr * qr;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
//result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||||
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -549,6 +589,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
|||||||
for (int i = 0 ; i < block_size ; i++)
|
for (int i = 0 ; i < block_size ; i++)
|
||||||
{
|
{
|
||||||
float val = s_distance[i];
|
float val = s_distance[i];
|
||||||
|
|
||||||
if (val < bestDistance1)
|
if (val < bestDistance1)
|
||||||
{
|
{
|
||||||
bestDistance2 = bestDistance1;
|
bestDistance2 = bestDistance1;
|
||||||
@ -602,7 +643,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
|
|||||||
__kernel void BruteForceMatch_knnMatch(
|
__kernel void BruteForceMatch_knnMatch(
|
||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global int2 *bestTrainIdx,
|
__global int2 *bestTrainIdx,
|
||||||
__global float2 *bestDistance,
|
__global float2 *bestDistance,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
@ -613,7 +654,7 @@ __kernel void BruteForceMatch_knnMatch(
|
|||||||
int train_cols,
|
int train_cols,
|
||||||
int step,
|
int step,
|
||||||
int distType
|
int distType
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
const int lidx = get_local_id(0);
|
const int lidx = get_local_id(0);
|
||||||
const int lidy = get_local_id(1);
|
const int lidy = get_local_id(1);
|
||||||
@ -632,7 +673,8 @@ __kernel void BruteForceMatch_knnMatch(
|
|||||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||||
{
|
{
|
||||||
float result = 0.0f;
|
float result = 0.0f;
|
||||||
for (int i = 0 ; i < (query_cols + block_size -1) / block_size ; i++)
|
|
||||||
|
for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
|
||||||
{
|
{
|
||||||
const int loadx = lidx + i * block_size;
|
const int loadx = lidx + i * block_size;
|
||||||
//load query and train into local memory
|
//load query and train into local memory
|
||||||
@ -650,27 +692,33 @@ __kernel void BruteForceMatch_knnMatch(
|
|||||||
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
/* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
|
||||||
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
|
||||||
|
|
||||||
switch(distType)
|
switch (distType)
|
||||||
{
|
{
|
||||||
case 0:
|
case 0:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
|
||||||
result += qr * qr;
|
result += qr * qr;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
|
||||||
for (int j = 0 ; j < block_size ; j++)
|
for (int j = 0 ; j < block_size ; j++)
|
||||||
{
|
{
|
||||||
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
//result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
|
||||||
result += bit1Count((uint)s_query[lidy * block_size + j] ^ (uint)s_train[(uint)j * block_size + lidx]);
|
result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -719,6 +767,7 @@ __kernel void BruteForceMatch_knnMatch(
|
|||||||
for (int i = 0 ; i < block_size ; i++)
|
for (int i = 0 ; i < block_size ; i++)
|
||||||
{
|
{
|
||||||
float val = s_distance[i];
|
float val = s_distance[i];
|
||||||
|
|
||||||
if (val < bestDistance1)
|
if (val < bestDistance1)
|
||||||
{
|
{
|
||||||
bestDistance2 = bestDistance1;
|
bestDistance2 = bestDistance1;
|
||||||
@ -772,7 +821,7 @@ __kernel void BruteForceMatch_knnMatch(
|
|||||||
kernel void BruteForceMatch_calcDistanceUnrolled(
|
kernel void BruteForceMatch_calcDistanceUnrolled(
|
||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global float *allDist,
|
__global float *allDist,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
int block_size,
|
||||||
@ -790,7 +839,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
|
|||||||
kernel void BruteForceMatch_calcDistance(
|
kernel void BruteForceMatch_calcDistance(
|
||||||
__global float *query,
|
__global float *query,
|
||||||
__global float *train,
|
__global float *train,
|
||||||
__global float *mask,
|
//__global float *mask,
|
||||||
__global float *allDist,
|
__global float *allDist,
|
||||||
__local float *sharebuffer,
|
__local float *sharebuffer,
|
||||||
int block_size,
|
int block_size,
|
||||||
@ -810,7 +859,7 @@ kernel void BruteForceMatch_findBestMatch(
|
|||||||
__global float *bestDistance,
|
__global float *bestDistance,
|
||||||
int k,
|
int k,
|
||||||
int block_size
|
int block_size
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
/* Todo */
|
/* Todo */
|
||||||
}
|
}
|
Loading…
x
Reference in New Issue
Block a user