Optimize bfmatcher by passing macros.
This commit is contained in:
parent
1e49c00f4b
commit
113b7584e0
@ -74,6 +74,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
|
||||
int m_size = MAX_DESC_LEN;
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
static const int OPT_SIZE = 40;
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@ -82,8 +85,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@ -93,7 +94,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
|
||||
|
||||
std::string kernelName = "BruteForceMatch_UnrollMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
|
||||
}
|
||||
}
|
||||
|
||||
@ -115,6 +116,9 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
int block_size = BLOCK_SIZE;
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
static const int OPT_SIZE = 40;
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt, "-D block_size=%d", block_size);
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@ -123,7 +127,6 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@ -133,7 +136,7 @@ void match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
|
||||
std::string kernelName = "BruteForceMatch_Match";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
|
||||
}
|
||||
}
|
||||
|
||||
@ -157,6 +160,9 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
||||
int m_size = MAX_DESC_LEN;
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
static const int OPT_SIZE = 40;
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@ -167,8 +173,6 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@ -180,7 +184,7 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, float maxDist
|
||||
|
||||
std::string kernelName = "BruteForceMatch_RadiusUnrollMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
|
||||
}
|
||||
}
|
||||
|
||||
@ -197,6 +201,9 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
|
||||
int block_size = BLOCK_SIZE;
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
static const int OPT_SIZE = 40;
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt, "-D block_size=%d", block_size);
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@ -207,7 +214,6 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&nMatches.data ));
|
||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@ -219,7 +225,7 @@ void radius_match(const oclMat &query, const oclMat &train, float maxDistance, c
|
||||
|
||||
std::string kernelName = "BruteForceMatch_RadiusMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
|
||||
}
|
||||
}
|
||||
|
||||
@ -294,6 +300,9 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
|
||||
int m_size = MAX_DESC_LEN;
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
static const int OPT_SIZE = 40;
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt, "-D block_size=%d -D max_desc_len=%d", block_size, m_size);
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@ -302,8 +311,6 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&m_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@ -313,7 +320,7 @@ void knn_matchUnrolledCached(const oclMat &query, const oclMat &train, const ocl
|
||||
|
||||
std::string kernelName = "BruteForceMatch_knnUnrollMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
|
||||
}
|
||||
}
|
||||
|
||||
@ -328,6 +335,9 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
int block_size = BLOCK_SIZE;
|
||||
vector< pair<size_t, const void *> > args;
|
||||
|
||||
static const int OPT_SIZE = 40;
|
||||
char opt [OPT_SIZE] = "";
|
||||
sprintf(opt, "-D block_size=%d", block_size);
|
||||
if(globalSize[0] != 0)
|
||||
{
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&query.data ));
|
||||
@ -336,7 +346,6 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&trainIdx.data ));
|
||||
args.push_back( make_pair( sizeof(cl_mem), (void *)&distance.data ));
|
||||
args.push_back( make_pair( smemSize, (void *)NULL));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&block_size ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.rows ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&query.cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int), (void *)&train.rows ));
|
||||
@ -346,7 +355,7 @@ void knn_match(const oclMat &query, const oclMat &train, const oclMat &/*mask*/,
|
||||
|
||||
std::string kernelName = "BruteForceMatch_knnMatch";
|
||||
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth());
|
||||
openCLExecuteKernel(ctx, &brute_force_match, kernelName, globalSize, localSize, args, -1, query.depth(), opt);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,58 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Nathan, liujun@multicorewareinc.com
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
|
||||
#define MAX_FLOAT 1e7f
|
||||
#define MAX_FLOAT 3.40282e+038f
|
||||
|
||||
#ifndef block_size
|
||||
#define block_size 16
|
||||
#endif
|
||||
#ifndef max_desc_len
|
||||
#define max_desc_len 64
|
||||
#endif
|
||||
|
||||
int bit1Count(float x)
|
||||
{
|
||||
@ -15,7 +68,6 @@ int bit1Count(float x)
|
||||
|
||||
float reduce_block(__local float *s_query,
|
||||
__local float *s_train,
|
||||
int block_size,
|
||||
int lidx,
|
||||
int lidy,
|
||||
int distType
|
||||
@ -51,8 +103,6 @@ float reduce_block(__local float *s_query,
|
||||
|
||||
float reduce_multi_block(__local float *s_query,
|
||||
__local float *s_train,
|
||||
int max_desc_len,
|
||||
int block_size,
|
||||
int block_index,
|
||||
int lidx,
|
||||
int lidy,
|
||||
@ -98,8 +148,6 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -108,6 +156,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
int distType
|
||||
)
|
||||
{
|
||||
|
||||
const int lidx = get_local_id(0);
|
||||
const int lidy = get_local_id(1);
|
||||
const int groupidx = get_group_id(0);
|
||||
@ -117,6 +166,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
|
||||
int queryIdx = groupidx * block_size + lidy;
|
||||
// load the query into local memory.
|
||||
#pragma unroll
|
||||
for (int i = 0 ; i < max_desc_len / block_size; i ++)
|
||||
{
|
||||
int loadx = lidx + i * block_size;
|
||||
@ -128,9 +178,10 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
|
||||
// loopUnrolledCached to find the best trainIdx and best distance.
|
||||
volatile int imgIdx = 0;
|
||||
for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
|
||||
for (int t = 0, endt = (train_rows + block_size - 1) / block_size; t < endt; t++)
|
||||
{
|
||||
float result = 0;
|
||||
#pragma unroll
|
||||
for (int i = 0 ; i < max_desc_len / block_size ; i++)
|
||||
{
|
||||
//load a block_size * block_size block into local train.
|
||||
@ -140,7 +191,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
|
||||
result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
@ -168,6 +219,7 @@ __kernel void BruteForceMatch_UnrollMatch_D5(
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
//reduce -- now all reduce implement in each threads.
|
||||
#pragma unroll
|
||||
for (int k = 0 ; k < block_size; k++)
|
||||
{
|
||||
if (myBestDistance > s_distance[k])
|
||||
@ -191,7 +243,6 @@ __kernel void BruteForceMatch_Match_D5(
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -232,7 +283,7 @@ __kernel void BruteForceMatch_Match_D5(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
||||
result += reduce_block(s_query, s_train, lidx, lidy, distType);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
@ -287,8 +338,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -322,7 +371,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch_D5(
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
||||
result += reduce_block(s_query, s_train, lidx, lidy, distType);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
@ -350,7 +399,6 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
|
||||
__global float *bestDistance,
|
||||
__global int *nMatches,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -384,7 +432,7 @@ __kernel void BruteForceMatch_RadiusMatch_D5(
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
||||
result += reduce_block(s_query, s_train, lidx, lidy, distType);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
@ -410,8 +458,6 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -455,7 +501,7 @@ __kernel void BruteForceMatch_knnUnrollMatch_D5(
|
||||
//synchronize to make sure each elem for reduceIteration in share memory is written already.
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_multi_block(s_query, s_train, max_desc_len, block_size, i, lidx, lidy, distType);
|
||||
result += reduce_multi_block(s_query, s_train, i, lidx, lidy, distType);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
@ -559,7 +605,6 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
||||
__global int2 *bestTrainIdx,
|
||||
__global float2 *bestDistance,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -600,7 +645,7 @@ __kernel void BruteForceMatch_knnMatch_D5(
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
result += reduce_block(s_query, s_train, block_size, lidx, lidy, distType);
|
||||
result += reduce_block(s_query, s_train, lidx, lidy, distType);
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
@ -703,8 +748,6 @@ kernel void BruteForceMatch_calcDistanceUnrolled_D5(
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int max_desc_len,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -721,7 +764,6 @@ kernel void BruteForceMatch_calcDistance_D5(
|
||||
//__global float *mask,
|
||||
__global float *allDist,
|
||||
__local float *sharebuffer,
|
||||
int block_size,
|
||||
int query_rows,
|
||||
int query_cols,
|
||||
int train_rows,
|
||||
@ -736,8 +778,7 @@ kernel void BruteForceMatch_findBestMatch_D5(
|
||||
__global float *allDist,
|
||||
__global int *bestTrainIdx,
|
||||
__global float *bestDistance,
|
||||
int k,
|
||||
int block_size
|
||||
int k
|
||||
)
|
||||
{
|
||||
/* Todo */
|
||||
|
Loading…
x
Reference in New Issue
Block a user