HOST side optimization for GFFT

This commit is contained in:
krodyush 2013-12-17 14:02:57 +04:00
parent 5d5527d03e
commit a63576e76d
3 changed files with 241 additions and 323 deletions

View File

@ -1381,8 +1381,10 @@ namespace cv
oclMat Dx_;
oclMat Dy_;
oclMat eig_;
oclMat eig_minmax_;
oclMat minMaxbuf_;
oclMat tmpCorners_;
oclMat counter_;
};
inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,

View File

@ -48,63 +48,35 @@
using namespace cv;
using namespace cv::ocl;
// currently sort procedure on the host is more efficient
static bool use_cpu_sorter = true;
namespace
// compact structure for corners
struct DefCorner
{
enum SortMethod
{
CPU_STL,
BITONIC,
SELECTION
float eig; //eigenvalue of corner
short x; //x coordinate of corner point
short y; //y coordinate of corner point
} ;
const int GROUP_SIZE = 256;
template<SortMethod method>
struct Sorter
// compare procedure for corner
//it is used for sort on the host side
struct DefCornerCompare
{
//typedef EigType;
};
//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
template<>
struct Sorter<CPU_STL>
bool operator()(const DefCorner a, const DefCorner b) const
{
typedef oclMat EigType;
static cv::Mutex cs;
static Mat mat_eig;
//prototype
static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
{
float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
return v1 > v2;
}
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
cv::AutoLock lock(cs);
//temporarily use STL's sort function
Mat mat_corners = corners;
mat_eig = eig_tex;
std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
corners = mat_corners;
return a.eig > b.eig;
}
};
cv::Mutex Sorter<CPU_STL>::cs;
cv::Mat Sorter<CPU_STL>::mat_eig;
template<>
struct Sorter<BITONIC>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
// sort corner point using opencl bitonicosrt implementation
static void sortCorners_caller(oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count / 2, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
int GS = count/2;
int LS = min(255,GS);
size_t globalThreads[3] = {GS, 1, 1};
size_t localThreads[3] = {LS, 1, 1};
// 2^numStages should be equal to count or the output is invalid
int numStages = 0;
@ -112,90 +84,106 @@ struct Sorter<BITONIC>
{
++numStages;
}
const int argc = 5;
const int argc = 4;
std::vector< std::pair<size_t, const void *> > args(argc);
std::string kernelname = "sortCorners_bitonicSort";
args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
args[1] = std::make_pair(sizeof(cl_int), (void *)&count);
for(int stage = 0; stage < numStages; ++stage)
{
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
args[2] = std::make_pair(sizeof(cl_int), (void *)&stage);
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
{
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
}
}
}
};
template<>
struct Sorter<SELECTION>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
std::vector< std::pair<size_t, const void *> > args;
//local
std::string kernelname = "sortCorners_selectionSortLocal";
int lds_size = GROUP_SIZE * sizeof(cl_float2);
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
args.push_back( std::make_pair( lds_size, (void*)NULL) );
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
//final
kernelname = "sortCorners_selectionSortFinal";
args.pop_back();
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
}
};
int findCorners_caller(
const TextureCL& eig,
const float threshold,
// find corners on matrix and put it into array
void findCorners_caller(
const oclMat& eig_mat, //input matrix worth eigenvalues
oclMat& eigMinMax, //input with min and max values of eigenvalues
const float qualityLevel,
const oclMat& mask,
oclMat& corners,
const int max_count)
oclMat& corners, //output array with detected corners
oclMat& counter) //output value with number of detected corners, have to be 0 before call
{
string opt;
std::vector<int> k;
Context * cxt = Context::getContext();
std::vector< std::pair<size_t, const void*> > args;
std::string kernelname = "findCorners";
const int mask_strip = mask.step / mask.elemSize1();
oclMat g_counter(1, 1, CV_32SC1);
g_counter.setTo(0);
args.push_back(make_pair( sizeof(cl_mem), (void*)&(eig_mat.data)));
args.push_back(make_pair( sizeof(cl_mem), (void*)&eig ));
int src_pitch = (int)eig_mat.step;
args.push_back(make_pair( sizeof(cl_int), (void*)&src_pitch ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&mask.data ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&corners.data ));
args.push_back(make_pair( sizeof(cl_int), (void*)&mask_strip));
args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&eigMinMax.data ));
args.push_back(make_pair( sizeof(cl_float), (void*)&qualityLevel ));
args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.rows ));
args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.cols ));
args.push_back(make_pair( sizeof(cl_int), (void*)&corners.cols ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&counter.data ));
size_t globalThreads[3] = {eig.cols, eig.rows, 1};
size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1};
size_t localThreads[3] = {16, 16, 1};
if(!mask.empty())
opt += " -D WITH_MASK=1";
const char * opt = mask.empty() ? "" : "-D WITH_MASK";
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt);
return std::min(Mat(g_counter).at<int>(0), max_count);
openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str());
}
static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
{
size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
CV_Assert(groupnum != 0);
int dbsize = groupnum * 2 * src.elemSize();
ensureSizeIsEnough(1, dbsize, CV_8UC1, dst);
cl_mem dst_data = reinterpret_cast<cl_mem>(dst.data);
int all_cols = src.step / src.elemSize();
int pre_cols = (src.offset % src.step) / src.elemSize();
int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
int invalid_cols = pre_cols + sec_cols;
int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
int offset = src.offset / src.elemSize();
{// first parallel pass
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
size_t globalThreads[3] = {groupnum * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1};
openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
args, -1, -1, "-D T=float -D DEPTH_5");
}
{// run final "serial" kernel to find accumulate results from threads and reset corner counter
vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&tozero.data ));
size_t globalThreads[3] = {1, 1, 1};
size_t localThreads[3] = {1, 1, 1};
openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads,
args, -1, -1);
}
}
}//unnamed namespace
void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
{
@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
else
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
double maxVal = 0;
minMax(eig_, NULL, &maxVal);
ensureSizeIsEnough(1,1, CV_32SC1, counter_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
// find max eigenvalue and reset detected counters
minMaxEig_caller(eig_,eig_minmax_,counter_);
Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
int total = findCorners_caller(
*eig_tex,
static_cast<float>(maxVal * qualityLevel),
// allocate buffer for kernels
int corner_array_size = std::max(1024, static_cast<int>(image.size().area() * 0.05));
if(!use_cpu_sorter)
{ // round to 2^n
unsigned int n=1;
for(n=1;n<(unsigned int)corner_array_size;n<<=1);
corner_array_size = (int)n;
ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
// set to 0 to be able use bitonic sort on whole 2^n array
tmpCorners_.setTo(0);
}
else
{
ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
}
int total = tmpCorners_.cols; // by default the number of corner is full array
vector<DefCorner> tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm
//find points with high eigenvalue and put it into the output array
findCorners_caller(
eig_,
eig_minmax_,
static_cast<float>(qualityLevel),
mask,
tmpCorners_,
tmpCorners_.cols);
counter_);
if(!use_cpu_sorter)
{// sort detected corners on deivce side
sortCorners_caller(tmpCorners_, corner_array_size);
}
else
{// send non-blocking request to read real non-zero number of corners to sort it on the HOST side
openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL));
}
//blocking read whole corners array (sorted or not sorted)
openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner));
if (total == 0)
{
{// check for trivial case
corners.release();
return;
}
if(use_cpu_sorter)
{
Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
}
else
{
//if total is power of 2
if(((total - 1) & (total)) == 0)
{
Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
else
{
Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
{// sort detected corners on cpu side.
tmp.resize(total);
cv::sort(tmp,DefCornerCompare());
}
//estimate maximal size of final output array
int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total;
int D2 = (int)ceil(minDistance * minDistance);
// allocate output buffer
vector<Point2f> tmp2;
tmp2.reserve(total_max);
if (minDistance < 1)
{// we have not distance restriction. then just copy with conversion maximal allowed points into output array
for(int i=0;i<total_max && tmp[i].eig>0.0f;++i)
{
Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
tmpCorners_(roi_range).copyTo(corners);
tmp2.push_back(Point2f(tmp[i].x,tmp[i].y));
}
}
else
{
vector<Point2f> tmp(total);
downloadPoints(tmpCorners_, tmp);
vector<Point2f> tmp2;
tmp2.reserve(total);
{// we have distance restriction. then start coping to output array from the first element and check distance for each next one
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
std::vector< std::vector<Point2i> > grid(grid_width * grid_height);
for (int i = 0; i < total ; ++i)
{
Point2f p = tmp[i];
DefCorner p = tmp[i];
if(p.eig<=0.0f)
break; // condition to stop that is needed for GPU bitonic sort usage.
bool good = true;
@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
{
for (int xx = x1; xx <= x2; xx++)
{
vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
{
vector<Point2i>& m = grid[yy * grid_width + xx];
if (m.empty())
continue;
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
int dx = p.x - m[j].x;
int dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance * minDistance)
if (dx * dx + dy * dy < D2)
{
good = false;
goto break_out;
}
goto break_out_;
}
}
}
}
break_out:
break_out_:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y));
tmp2.push_back(p);
tmp2.push_back(Point2f(p.x,p.y));
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
int final_size = static_cast<int>(tmp2.size());
if(final_size>0)
corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0]));
else
corners.release();
}
void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
{

View File

@ -46,33 +46,26 @@
#ifndef WITH_MASK
#define WITH_MASK 0
#endif
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
{
return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
}
inline float ELEM_FLT2(image2d_t _eig, float2 pt)
{
return read_imagef(_eig, sampler, pt).x;
}
//macro to read eigenvalue matrix
#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x]
__kernel
void findCorners
(
image2d_t eig,
__global const char* eig,
const int eig_pitch,
__global const char* mask,
__global float2* corners,
const int mask_strip,// in pixels
const float threshold,
__global const float* pMinMax,
const float qualityLevel,
const int rows,
const int cols,
const int max_count,
__global int* g_counter
)
{
float threshold = qualityLevel*pMinMax[1];
const int j = get_global_id(0);
const int i = get_global_id(1);
@ -82,39 +75,42 @@ __kernel
#endif
)
{
const float val = ELEM_INT2(eig, j, i);
const float val = GET_SRC_32F(j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal);
maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal);
maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal);
maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal);
maxVal = fmax(GET_SRC_32F(j , i + 1), maxVal);
maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = atomic_inc(g_counter);
if (ind < max_count)
corners[ind] = (float2)(j, i);
{// pack and store eigenvalue and its coordinates
corners[ind].x = val;
corners[ind].y = as_float(j|(i<<16));
}
}
}
}
}
#undef GET_SRC_32F
//bitonic sort
__kernel
void sortCorners_bitonicSort
(
image2d_t eig,
__global float2 * corners,
const int count,
const int stage,
@ -140,8 +136,8 @@ __kernel
const float2 leftPt = corners[leftId];
const float2 rightPt = corners[rightId];
const float leftVal = ELEM_FLT2(eig, leftPt);
const float rightVal = ELEM_FLT2(eig, rightPt);
const float leftVal = leftPt.x;
const float rightVal = rightPt.x;
const bool compareResult = leftVal > rightVal;
@ -152,124 +148,22 @@ __kernel
corners[rightId] = sortOrder ? greater : lesser;
}
//selection sort for gfft
//kernel is ported from Bolt library:
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
// Local sort will firstly sort elements of each workgroup using selection sort
// its performance is O(n)
__kernel
void sortCorners_selectionSortLocal
(
image2d_t eig,
__global float2 * corners,
const int count,
__local float2 * scratch
)
// this is simple short serial kernel that makes some short reduction and initialization work
// it makes HOST like work to avoid additional sync with HOST to do this short work
// data - input/output float2.
// input data are sevral (min,max) pairs
// output data is one reduced (min,max) pair
// g_counter - counter that have to be initialized by 0 for next findCorner call.
__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter)
{
int i = get_local_id(0); // index in workgroup
int numOfGroups = get_num_groups(0); // index in workgroup
int groupID = get_group_id(0);
int wg = get_local_size(0); // workgroup size = block size
int n; // number of elements to be processed for this work group
int offset = groupID * wg;
int same = 0;
corners += offset;
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
float2 pt1, pt2;
pt1 = corners[min(i, n)];
scratch[i] = pt1;
barrier(CLK_LOCAL_MEM_FENCE);
if(i >= n)
g_counter[0] = 0;
float minVal = data[0];
float maxVal = data[groupnum];
for(int i=1;i<groupnum;++i)
{
return;
minVal = min(minVal,data[i]);
maxVal = max(maxVal,data[i+groupnum]);
}
float val1 = ELEM_FLT2(eig, pt1);
float val2;
int pos = 0;
for (int j=0;j<n;++j)
{
pt2 = scratch[j];
val2 = ELEM_FLT2(eig, pt2);
if(val2 > val1)
pos++;//calculate the rank of this element in this work group
else
{
if(val1 > val2)
continue;
else
{
// val1 and val2 are same
same++;
}
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}
__kernel
void sortCorners_selectionSortFinal
(
image2d_t eig,
__global float2 * corners,
const int count
)
{
const int i = get_local_id(0); // index in workgroup
const int numOfGroups = get_num_groups(0); // index in workgroup
const int groupID = get_group_id(0);
const int wg = get_local_size(0); // workgroup size = block size
int pos = 0, same = 0;
const int offset = get_group_id(0) * wg;
const int remainder = count - wg*(numOfGroups-1);
if((offset + i ) >= count)
return;
float2 pt1, pt2;
pt1 = corners[groupID*wg + i];
float val1 = ELEM_FLT2(eig, pt1);
float val2;
for(int j=0; j<numOfGroups-1; j++ )
{
for(int k=0; k<wg; k++)
{
pt2 = corners[j*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Increment only if the value is not the same.
if( val2 > val1 )
pos++;
else
same++;
}
}
}
for(int k=0; k<remainder; k++)
{
pt2 = corners[(numOfGroups-1)*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Don't increment if the value is the same.
//Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false
if(val2 > val1)
pos++;
else
same++;
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
data[0] = minVal;
data[1] = maxVal;
}