HOST side optimization for GFFT
This commit is contained in:
parent
5d5527d03e
commit
a63576e76d
@ -1381,8 +1381,10 @@ namespace cv
|
||||
oclMat Dx_;
|
||||
oclMat Dy_;
|
||||
oclMat eig_;
|
||||
oclMat eig_minmax_;
|
||||
oclMat minMaxbuf_;
|
||||
oclMat tmpCorners_;
|
||||
oclMat counter_;
|
||||
};
|
||||
|
||||
inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
|
||||
|
@ -48,63 +48,35 @@
|
||||
using namespace cv;
|
||||
using namespace cv::ocl;
|
||||
|
||||
// currently sort procedure on the host is more efficient
|
||||
static bool use_cpu_sorter = true;
|
||||
|
||||
namespace
|
||||
// compact structure for corners
|
||||
struct DefCorner
|
||||
{
|
||||
enum SortMethod
|
||||
{
|
||||
CPU_STL,
|
||||
BITONIC,
|
||||
SELECTION
|
||||
float eig; //eigenvalue of corner
|
||||
short x; //x coordinate of corner point
|
||||
short y; //y coordinate of corner point
|
||||
} ;
|
||||
|
||||
const int GROUP_SIZE = 256;
|
||||
|
||||
template<SortMethod method>
|
||||
struct Sorter
|
||||
// compare procedure for corner
|
||||
//it is used for sort on the host side
|
||||
struct DefCornerCompare
|
||||
{
|
||||
//typedef EigType;
|
||||
};
|
||||
|
||||
//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
|
||||
template<>
|
||||
struct Sorter<CPU_STL>
|
||||
bool operator()(const DefCorner a, const DefCorner b) const
|
||||
{
|
||||
typedef oclMat EigType;
|
||||
static cv::Mutex cs;
|
||||
static Mat mat_eig;
|
||||
|
||||
//prototype
|
||||
static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
|
||||
{
|
||||
float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
|
||||
float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
|
||||
return v1 > v2;
|
||||
}
|
||||
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
|
||||
{
|
||||
cv::AutoLock lock(cs);
|
||||
//temporarily use STL's sort function
|
||||
Mat mat_corners = corners;
|
||||
mat_eig = eig_tex;
|
||||
std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
|
||||
corners = mat_corners;
|
||||
return a.eig > b.eig;
|
||||
}
|
||||
};
|
||||
cv::Mutex Sorter<CPU_STL>::cs;
|
||||
cv::Mat Sorter<CPU_STL>::mat_eig;
|
||||
|
||||
template<>
|
||||
struct Sorter<BITONIC>
|
||||
{
|
||||
typedef TextureCL EigType;
|
||||
|
||||
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
|
||||
// sort corner point using opencl bitonicosrt implementation
|
||||
static void sortCorners_caller(oclMat& corners, const int count)
|
||||
{
|
||||
Context * cxt = Context::getContext();
|
||||
size_t globalThreads[3] = {count / 2, 1, 1};
|
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
|
||||
int GS = count/2;
|
||||
int LS = min(255,GS);
|
||||
size_t globalThreads[3] = {GS, 1, 1};
|
||||
size_t localThreads[3] = {LS, 1, 1};
|
||||
|
||||
// 2^numStages should be equal to count or the output is invalid
|
||||
int numStages = 0;
|
||||
@ -112,90 +84,106 @@ struct Sorter<BITONIC>
|
||||
{
|
||||
++numStages;
|
||||
}
|
||||
const int argc = 5;
|
||||
const int argc = 4;
|
||||
std::vector< std::pair<size_t, const void *> > args(argc);
|
||||
std::string kernelname = "sortCorners_bitonicSort";
|
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
|
||||
args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
|
||||
args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
|
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
|
||||
args[1] = std::make_pair(sizeof(cl_int), (void *)&count);
|
||||
for(int stage = 0; stage < numStages; ++stage)
|
||||
{
|
||||
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
|
||||
args[2] = std::make_pair(sizeof(cl_int), (void *)&stage);
|
||||
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
|
||||
{
|
||||
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
|
||||
args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
|
||||
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct Sorter<SELECTION>
|
||||
{
|
||||
typedef TextureCL EigType;
|
||||
|
||||
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
|
||||
{
|
||||
Context * cxt = Context::getContext();
|
||||
|
||||
size_t globalThreads[3] = {count, 1, 1};
|
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
//local
|
||||
std::string kernelname = "sortCorners_selectionSortLocal";
|
||||
int lds_size = GROUP_SIZE * sizeof(cl_float2);
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
|
||||
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
|
||||
args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
|
||||
args.push_back( std::make_pair( lds_size, (void*)NULL) );
|
||||
|
||||
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
|
||||
|
||||
//final
|
||||
kernelname = "sortCorners_selectionSortFinal";
|
||||
args.pop_back();
|
||||
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
|
||||
}
|
||||
};
|
||||
|
||||
int findCorners_caller(
|
||||
const TextureCL& eig,
|
||||
const float threshold,
|
||||
// find corners on matrix and put it into array
|
||||
void findCorners_caller(
|
||||
const oclMat& eig_mat, //input matrix worth eigenvalues
|
||||
oclMat& eigMinMax, //input with min and max values of eigenvalues
|
||||
const float qualityLevel,
|
||||
const oclMat& mask,
|
||||
oclMat& corners,
|
||||
const int max_count)
|
||||
oclMat& corners, //output array with detected corners
|
||||
oclMat& counter) //output value with number of detected corners, have to be 0 before call
|
||||
{
|
||||
string opt;
|
||||
std::vector<int> k;
|
||||
Context * cxt = Context::getContext();
|
||||
|
||||
std::vector< std::pair<size_t, const void*> > args;
|
||||
std::string kernelname = "findCorners";
|
||||
|
||||
const int mask_strip = mask.step / mask.elemSize1();
|
||||
|
||||
oclMat g_counter(1, 1, CV_32SC1);
|
||||
g_counter.setTo(0);
|
||||
args.push_back(make_pair( sizeof(cl_mem), (void*)&(eig_mat.data)));
|
||||
|
||||
args.push_back(make_pair( sizeof(cl_mem), (void*)&eig ));
|
||||
int src_pitch = (int)eig_mat.step;
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&src_pitch ));
|
||||
args.push_back(make_pair( sizeof(cl_mem), (void*)&mask.data ));
|
||||
args.push_back(make_pair( sizeof(cl_mem), (void*)&corners.data ));
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&mask_strip));
|
||||
args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
|
||||
args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
|
||||
args.push_back(make_pair( sizeof(cl_mem), (void*)&eigMinMax.data ));
|
||||
args.push_back(make_pair( sizeof(cl_float), (void*)&qualityLevel ));
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.rows ));
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.cols ));
|
||||
args.push_back(make_pair( sizeof(cl_int), (void*)&corners.cols ));
|
||||
args.push_back(make_pair( sizeof(cl_mem), (void*)&counter.data ));
|
||||
|
||||
size_t globalThreads[3] = {eig.cols, eig.rows, 1};
|
||||
size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1};
|
||||
size_t localThreads[3] = {16, 16, 1};
|
||||
if(!mask.empty())
|
||||
opt += " -D WITH_MASK=1";
|
||||
|
||||
const char * opt = mask.empty() ? "" : "-D WITH_MASK";
|
||||
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt);
|
||||
return std::min(Mat(g_counter).at<int>(0), max_count);
|
||||
openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str());
|
||||
}
|
||||
|
||||
|
||||
static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
|
||||
{
|
||||
size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
|
||||
CV_Assert(groupnum != 0);
|
||||
|
||||
int dbsize = groupnum * 2 * src.elemSize();
|
||||
|
||||
ensureSizeIsEnough(1, dbsize, CV_8UC1, dst);
|
||||
|
||||
cl_mem dst_data = reinterpret_cast<cl_mem>(dst.data);
|
||||
|
||||
int all_cols = src.step / src.elemSize();
|
||||
int pre_cols = (src.offset % src.step) / src.elemSize();
|
||||
int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
|
||||
int invalid_cols = pre_cols + sec_cols;
|
||||
int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
|
||||
int offset = src.offset / src.elemSize();
|
||||
|
||||
{// first parallel pass
|
||||
vector<pair<size_t , const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
|
||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
|
||||
size_t globalThreads[3] = {groupnum * 256, 1, 1};
|
||||
size_t localThreads[3] = {256, 1, 1};
|
||||
openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
|
||||
args, -1, -1, "-D T=float -D DEPTH_5");
|
||||
}
|
||||
|
||||
{// run final "serial" kernel to find accumulate results from threads and reset corner counter
|
||||
vector<pair<size_t , const void *> > args;
|
||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
|
||||
args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum ));
|
||||
args.push_back( make_pair( sizeof(cl_mem) , (void *)&tozero.data ));
|
||||
size_t globalThreads[3] = {1, 1, 1};
|
||||
size_t localThreads[3] = {1, 1, 1};
|
||||
openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads,
|
||||
args, -1, -1);
|
||||
}
|
||||
}
|
||||
}//unnamed namespace
|
||||
|
||||
void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
|
||||
{
|
||||
@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
|
||||
ensureSizeIsEnough(image.size(), CV_32F, eig_);
|
||||
|
||||
if (useHarrisDetector)
|
||||
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
|
||||
cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
|
||||
else
|
||||
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
|
||||
|
||||
double maxVal = 0;
|
||||
minMax(eig_, NULL, &maxVal);
|
||||
ensureSizeIsEnough(1,1, CV_32SC1, counter_);
|
||||
|
||||
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
|
||||
// find max eigenvalue and reset detected counters
|
||||
minMaxEig_caller(eig_,eig_minmax_,counter_);
|
||||
|
||||
Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
|
||||
int total = findCorners_caller(
|
||||
*eig_tex,
|
||||
static_cast<float>(maxVal * qualityLevel),
|
||||
// allocate buffer for kernels
|
||||
int corner_array_size = std::max(1024, static_cast<int>(image.size().area() * 0.05));
|
||||
|
||||
if(!use_cpu_sorter)
|
||||
{ // round to 2^n
|
||||
unsigned int n=1;
|
||||
for(n=1;n<(unsigned int)corner_array_size;n<<=1);
|
||||
corner_array_size = (int)n;
|
||||
|
||||
ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
|
||||
|
||||
// set to 0 to be able use bitonic sort on whole 2^n array
|
||||
tmpCorners_.setTo(0);
|
||||
}
|
||||
else
|
||||
{
|
||||
ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
|
||||
}
|
||||
|
||||
int total = tmpCorners_.cols; // by default the number of corner is full array
|
||||
vector<DefCorner> tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm
|
||||
|
||||
//find points with high eigenvalue and put it into the output array
|
||||
findCorners_caller(
|
||||
eig_,
|
||||
eig_minmax_,
|
||||
static_cast<float>(qualityLevel),
|
||||
mask,
|
||||
tmpCorners_,
|
||||
tmpCorners_.cols);
|
||||
counter_);
|
||||
|
||||
if(!use_cpu_sorter)
|
||||
{// sort detected corners on deivce side
|
||||
sortCorners_caller(tmpCorners_, corner_array_size);
|
||||
}
|
||||
else
|
||||
{// send non-blocking request to read real non-zero number of corners to sort it on the HOST side
|
||||
openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL));
|
||||
}
|
||||
|
||||
//blocking read whole corners array (sorted or not sorted)
|
||||
openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner));
|
||||
|
||||
if (total == 0)
|
||||
{
|
||||
{// check for trivial case
|
||||
corners.release();
|
||||
return;
|
||||
}
|
||||
|
||||
if(use_cpu_sorter)
|
||||
{
|
||||
Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
|
||||
}
|
||||
else
|
||||
{
|
||||
//if total is power of 2
|
||||
if(((total - 1) & (total)) == 0)
|
||||
{
|
||||
Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
|
||||
}
|
||||
else
|
||||
{
|
||||
Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
|
||||
}
|
||||
{// sort detected corners on cpu side.
|
||||
tmp.resize(total);
|
||||
cv::sort(tmp,DefCornerCompare());
|
||||
}
|
||||
|
||||
//estimate maximal size of final output array
|
||||
int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total;
|
||||
int D2 = (int)ceil(minDistance * minDistance);
|
||||
// allocate output buffer
|
||||
vector<Point2f> tmp2;
|
||||
tmp2.reserve(total_max);
|
||||
|
||||
|
||||
if (minDistance < 1)
|
||||
{// we have not distance restriction. then just copy with conversion maximal allowed points into output array
|
||||
for(int i=0;i<total_max && tmp[i].eig>0.0f;++i)
|
||||
{
|
||||
Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
|
||||
tmpCorners_(roi_range).copyTo(corners);
|
||||
tmp2.push_back(Point2f(tmp[i].x,tmp[i].y));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
vector<Point2f> tmp(total);
|
||||
downloadPoints(tmpCorners_, tmp);
|
||||
|
||||
vector<Point2f> tmp2;
|
||||
tmp2.reserve(total);
|
||||
|
||||
{// we have distance restriction. then start coping to output array from the first element and check distance for each next one
|
||||
const int cell_size = cvRound(minDistance);
|
||||
const int grid_width = (image.cols + cell_size - 1) / cell_size;
|
||||
const int grid_height = (image.rows + cell_size - 1) / cell_size;
|
||||
|
||||
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
|
||||
std::vector< std::vector<Point2i> > grid(grid_width * grid_height);
|
||||
|
||||
for (int i = 0; i < total ; ++i)
|
||||
{
|
||||
Point2f p = tmp[i];
|
||||
DefCorner p = tmp[i];
|
||||
|
||||
if(p.eig<=0.0f)
|
||||
break; // condition to stop that is needed for GPU bitonic sort usage.
|
||||
|
||||
bool good = true;
|
||||
|
||||
@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
|
||||
{
|
||||
for (int xx = x1; xx <= x2; xx++)
|
||||
{
|
||||
vector<Point2f>& m = grid[yy * grid_width + xx];
|
||||
|
||||
if (!m.empty())
|
||||
{
|
||||
vector<Point2i>& m = grid[yy * grid_width + xx];
|
||||
if (m.empty())
|
||||
continue;
|
||||
for(size_t j = 0; j < m.size(); j++)
|
||||
{
|
||||
float dx = p.x - m[j].x;
|
||||
float dy = p.y - m[j].y;
|
||||
int dx = p.x - m[j].x;
|
||||
int dy = p.y - m[j].y;
|
||||
|
||||
if (dx * dx + dy * dy < minDistance * minDistance)
|
||||
if (dx * dx + dy * dy < D2)
|
||||
{
|
||||
good = false;
|
||||
goto break_out;
|
||||
}
|
||||
goto break_out_;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
break_out:
|
||||
break_out_:
|
||||
|
||||
if(good)
|
||||
{
|
||||
grid[y_cell * grid_width + x_cell].push_back(p);
|
||||
grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y));
|
||||
|
||||
tmp2.push_back(p);
|
||||
tmp2.push_back(Point2f(p.x,p.y));
|
||||
|
||||
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
|
||||
}
|
||||
int final_size = static_cast<int>(tmp2.size());
|
||||
if(final_size>0)
|
||||
corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0]));
|
||||
else
|
||||
corners.release();
|
||||
}
|
||||
void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
|
||||
{
|
||||
|
@ -46,33 +46,26 @@
|
||||
#ifndef WITH_MASK
|
||||
#define WITH_MASK 0
|
||||
#endif
|
||||
|
||||
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
|
||||
|
||||
inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
|
||||
{
|
||||
return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
|
||||
}
|
||||
|
||||
inline float ELEM_FLT2(image2d_t _eig, float2 pt)
|
||||
{
|
||||
return read_imagef(_eig, sampler, pt).x;
|
||||
}
|
||||
//macro to read eigenvalue matrix
|
||||
#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x]
|
||||
|
||||
__kernel
|
||||
void findCorners
|
||||
(
|
||||
image2d_t eig,
|
||||
__global const char* eig,
|
||||
const int eig_pitch,
|
||||
__global const char* mask,
|
||||
__global float2* corners,
|
||||
const int mask_strip,// in pixels
|
||||
const float threshold,
|
||||
__global const float* pMinMax,
|
||||
const float qualityLevel,
|
||||
const int rows,
|
||||
const int cols,
|
||||
const int max_count,
|
||||
__global int* g_counter
|
||||
)
|
||||
{
|
||||
float threshold = qualityLevel*pMinMax[1];
|
||||
const int j = get_global_id(0);
|
||||
const int i = get_global_id(1);
|
||||
|
||||
@ -82,39 +75,42 @@ __kernel
|
||||
#endif
|
||||
)
|
||||
{
|
||||
const float val = ELEM_INT2(eig, j, i);
|
||||
const float val = GET_SRC_32F(j, i);
|
||||
|
||||
if (val > threshold)
|
||||
{
|
||||
float maxVal = val;
|
||||
maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal);
|
||||
maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal);
|
||||
maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal);
|
||||
|
||||
maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
|
||||
maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal);
|
||||
maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
|
||||
maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal);
|
||||
maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal);
|
||||
|
||||
maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
|
||||
maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
|
||||
|
||||
maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
|
||||
maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal);
|
||||
maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
|
||||
maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal);
|
||||
maxVal = fmax(GET_SRC_32F(j , i + 1), maxVal);
|
||||
maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal);
|
||||
|
||||
if (val == maxVal)
|
||||
{
|
||||
const int ind = atomic_inc(g_counter);
|
||||
|
||||
if (ind < max_count)
|
||||
corners[ind] = (float2)(j, i);
|
||||
{// pack and store eigenvalue and its coordinates
|
||||
corners[ind].x = val;
|
||||
corners[ind].y = as_float(j|(i<<16));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#undef GET_SRC_32F
|
||||
|
||||
|
||||
//bitonic sort
|
||||
__kernel
|
||||
void sortCorners_bitonicSort
|
||||
(
|
||||
image2d_t eig,
|
||||
__global float2 * corners,
|
||||
const int count,
|
||||
const int stage,
|
||||
@ -140,8 +136,8 @@ __kernel
|
||||
const float2 leftPt = corners[leftId];
|
||||
const float2 rightPt = corners[rightId];
|
||||
|
||||
const float leftVal = ELEM_FLT2(eig, leftPt);
|
||||
const float rightVal = ELEM_FLT2(eig, rightPt);
|
||||
const float leftVal = leftPt.x;
|
||||
const float rightVal = rightPt.x;
|
||||
|
||||
const bool compareResult = leftVal > rightVal;
|
||||
|
||||
@ -152,124 +148,22 @@ __kernel
|
||||
corners[rightId] = sortOrder ? greater : lesser;
|
||||
}
|
||||
|
||||
//selection sort for gfft
|
||||
//kernel is ported from Bolt library:
|
||||
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
|
||||
// Local sort will firstly sort elements of each workgroup using selection sort
|
||||
// its performance is O(n)
|
||||
__kernel
|
||||
void sortCorners_selectionSortLocal
|
||||
(
|
||||
image2d_t eig,
|
||||
__global float2 * corners,
|
||||
const int count,
|
||||
__local float2 * scratch
|
||||
)
|
||||
// this is simple short serial kernel that makes some short reduction and initialization work
|
||||
// it makes HOST like work to avoid additional sync with HOST to do this short work
|
||||
// data - input/output float2.
|
||||
// input data are sevral (min,max) pairs
|
||||
// output data is one reduced (min,max) pair
|
||||
// g_counter - counter that have to be initialized by 0 for next findCorner call.
|
||||
__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter)
|
||||
{
|
||||
int i = get_local_id(0); // index in workgroup
|
||||
int numOfGroups = get_num_groups(0); // index in workgroup
|
||||
int groupID = get_group_id(0);
|
||||
int wg = get_local_size(0); // workgroup size = block size
|
||||
int n; // number of elements to be processed for this work group
|
||||
|
||||
int offset = groupID * wg;
|
||||
int same = 0;
|
||||
corners += offset;
|
||||
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
|
||||
float2 pt1, pt2;
|
||||
|
||||
pt1 = corners[min(i, n)];
|
||||
scratch[i] = pt1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(i >= n)
|
||||
g_counter[0] = 0;
|
||||
float minVal = data[0];
|
||||
float maxVal = data[groupnum];
|
||||
for(int i=1;i<groupnum;++i)
|
||||
{
|
||||
return;
|
||||
minVal = min(minVal,data[i]);
|
||||
maxVal = max(maxVal,data[i+groupnum]);
|
||||
}
|
||||
|
||||
float val1 = ELEM_FLT2(eig, pt1);
|
||||
float val2;
|
||||
|
||||
int pos = 0;
|
||||
for (int j=0;j<n;++j)
|
||||
{
|
||||
pt2 = scratch[j];
|
||||
val2 = ELEM_FLT2(eig, pt2);
|
||||
if(val2 > val1)
|
||||
pos++;//calculate the rank of this element in this work group
|
||||
else
|
||||
{
|
||||
if(val1 > val2)
|
||||
continue;
|
||||
else
|
||||
{
|
||||
// val1 and val2 are same
|
||||
same++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j< same; j++)
|
||||
corners[pos + j] = pt1;
|
||||
}
|
||||
__kernel
|
||||
void sortCorners_selectionSortFinal
|
||||
(
|
||||
image2d_t eig,
|
||||
__global float2 * corners,
|
||||
const int count
|
||||
)
|
||||
{
|
||||
const int i = get_local_id(0); // index in workgroup
|
||||
const int numOfGroups = get_num_groups(0); // index in workgroup
|
||||
const int groupID = get_group_id(0);
|
||||
const int wg = get_local_size(0); // workgroup size = block size
|
||||
int pos = 0, same = 0;
|
||||
const int offset = get_group_id(0) * wg;
|
||||
const int remainder = count - wg*(numOfGroups-1);
|
||||
|
||||
if((offset + i ) >= count)
|
||||
return;
|
||||
float2 pt1, pt2;
|
||||
pt1 = corners[groupID*wg + i];
|
||||
|
||||
float val1 = ELEM_FLT2(eig, pt1);
|
||||
float val2;
|
||||
|
||||
for(int j=0; j<numOfGroups-1; j++ )
|
||||
{
|
||||
for(int k=0; k<wg; k++)
|
||||
{
|
||||
pt2 = corners[j*wg + k];
|
||||
val2 = ELEM_FLT2(eig, pt2);
|
||||
if(val1 > val2)
|
||||
break;
|
||||
else
|
||||
{
|
||||
//Increment only if the value is not the same.
|
||||
if( val2 > val1 )
|
||||
pos++;
|
||||
else
|
||||
same++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(int k=0; k<remainder; k++)
|
||||
{
|
||||
pt2 = corners[(numOfGroups-1)*wg + k];
|
||||
val2 = ELEM_FLT2(eig, pt2);
|
||||
if(val1 > val2)
|
||||
break;
|
||||
else
|
||||
{
|
||||
//Don't increment if the value is the same.
|
||||
//Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false
|
||||
if(val2 > val1)
|
||||
pos++;
|
||||
else
|
||||
same++;
|
||||
}
|
||||
}
|
||||
for (int j=0; j< same; j++)
|
||||
corners[pos + j] = pt1;
|
||||
data[0] = minVal;
|
||||
data[1] = maxVal;
|
||||
}
|
Loading…
Reference in New Issue
Block a user