Add ocl's good features to track implementation.

Additional notes with this commit:
1. Add cornerHarris_dxdy and cornerMinEigenVal_dxdy to get
the interim dx and dy output of Sobel operator;
2. Add minMax_buf to allow user to reuse buffers in minMax;
3. Fix an error when either min or max pointer fed into minMax is NULL;
4. Corner sorter temporarily uses C++ STL's quick sort. A parallel
 selection sort in OpneCL is contained in the implementation but disabled
due to poor performance at the moment.
5. Accuracy test for ocl gfft.
This commit is contained in:
peng xiao
2013-05-22 13:46:42 +08:00
parent d4255b7f75
commit b4a4a05bdc
8 changed files with 841 additions and 32 deletions

View File

@@ -782,45 +782,55 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
}
}
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
const oclMat &mask, oclMat &buf)
{
size_t groupnum = src.clCxt->computeUnits();
CV_Assert(groupnum != 0);
groupnum = groupnum * 2;
int vlen = 8;
int dbsize = groupnum * 2 * vlen * sizeof(T) ;
Context *clCxt = src.clCxt;
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
*minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
if (mask.empty())
{
arithmetic_minMax_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax");
arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax");
}
else
{
arithmetic_minMax_mask_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax_mask");
arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask");
}
T *p = new T[groupnum * vlen * 2];
memset(p, 0, dbsize);
openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
if(minVal != NULL){
Mat matbuf = Mat(buf);
T *p = matbuf.ptr<T>();
if(minVal != NULL)
{
*minVal = std::numeric_limits<double>::max();
for(int i = 0; i < vlen * (int)groupnum; i++)
{
*minVal = *minVal < p[i] ? *minVal : p[i];
}
}
if(maxVal != NULL){
if(maxVal != NULL)
{
*maxVal = -std::numeric_limits<double>::max();
for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
{
*maxVal = *maxVal > p[i] ? *maxVal : p[i];
}
}
delete[] p;
openCLFree(dstBuffer);
}
typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
{
oclMat buf;
minMax_buf(src, minVal, maxVal, mask, buf);
}
void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
{
CV_Assert(src.oclchannels() == 1);
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -840,7 +850,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
};
minMaxFunc func;
func = functab[src.depth()];
func(src, minVal, maxVal, mask);
func(src, minVal, maxVal, mask, buf);
}
//////////////////////////////////////////////////////////////////////////////

351
modules/ocl/src/gfft.cpp Normal file
View File

@@ -0,0 +1,351 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <iomanip>
#include "precomp.hpp"
using namespace cv;
using namespace cv::ocl;
static bool use_cpu_sorter = true;
namespace cv
{
namespace ocl
{
///////////////////////////OpenCL kernel strings///////////////////////////
extern const char *imgproc_gfft;
}
}
namespace
{
enum SortMethod
{
CPU_STL,
BITONIC,
SELECTION
};
const int GROUP_SIZE = 256;
template<SortMethod method>
struct Sorter
{
//typedef EigType;
};
//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
template<>
struct Sorter<CPU_STL>
{
typedef oclMat EigType;
static cv::Mutex cs;
static Mat mat_eig;
//prototype
static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
{
float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
return v1 > v2;
}
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
cv::AutoLock lock(cs);
//temporarily use STL's sort function
Mat mat_corners = corners;
mat_eig = eig_tex;
std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
corners = mat_corners;
}
};
cv::Mutex Sorter<CPU_STL>::cs;
cv::Mat Sorter<CPU_STL>::mat_eig;
template<>
struct Sorter<BITONIC>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count / 2, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
// 2^numStages should be equal to count or the output is invalid
int numStages = 0;
for(int i = count; i > 1; i >>= 1)
{
++numStages;
}
const int argc = 5;
std::vector< std::pair<size_t, const void *> > args(argc);
std::string kernelname = "sortCorners_bitonicSort";
args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
for(int stage = 0; stage < numStages; ++stage)
{
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
{
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
}
}
}
};
template<>
struct Sorter<SELECTION>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
std::vector< std::pair<size_t, const void *> > args;
//local
std::string kernelname = "sortCorners_selectionSortLocal";
int lds_size = GROUP_SIZE * sizeof(cl_float2);
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
args.push_back( std::make_pair( lds_size, (void*)NULL) );
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
//final
kernelname = "sortCorners_selectionSortFinal";
args.pop_back();
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
}
};
int findCorners_caller(
const TextureCL& eig,
const float threshold,
const oclMat& mask,
oclMat& corners,
const int max_count)
{
std::vector<int> k;
Context * cxt = Context::getContext();
std::vector< std::pair<size_t, const void*> > args;
std::string kernelname = "findCorners";
const int mask_strip = mask.step / mask.elemSize1();
oclMat g_counter(1, 1, CV_32SC1);
g_counter.setTo(0);
args.push_back(make_pair( sizeof(cl_mem), (void*)&eig ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&mask.data ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&corners.data ));
args.push_back(make_pair( sizeof(cl_int), (void*)&mask_strip));
args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
size_t globalThreads[3] = {eig.cols, eig.rows, 1};
size_t localThreads[3] = {16, 16, 1};
const char * opt = mask.empty() ? "" : "-D WITH_MASK";
openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1, opt);
return std::min(Mat(g_counter).at<int>(0), max_count);
}
}//unnamed namespace
void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
{
CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
CV_DbgAssert(support_image2d());
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
else
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
double maxVal = 0;
minMax_buf(eig_, 0, &maxVal, oclMat(), minMaxbuf_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
int total = findCorners_caller(
*eig_tex,
static_cast<float>(maxVal * qualityLevel),
mask,
tmpCorners_,
tmpCorners_.cols);
if (total == 0)
{
corners.release();
return;
}
if(use_cpu_sorter)
{
Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
}
else
{
//if total is power of 2
if(((total - 1) & (total)) == 0)
{
Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
else
{
Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
}
if (minDistance < 1)
{
corners = tmpCorners_(Rect(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1));
}
else
{
vector<Point2f> tmp(total);
downloadPoints(tmpCorners_, tmp);
vector<Point2f> tmp2;
tmp2.reserve(total);
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
for (int i = 0; i < total; ++i)
{
Point2f p = tmp[i];
bool good = true;
int x_cell = static_cast<int>(p.x / cell_size);
int y_cell = static_cast<int>(p.y / cell_size);
int x1 = x_cell - 1;
int y1 = y_cell - 1;
int x2 = x_cell + 1;
int y2 = y_cell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(grid_width - 1, x2);
y2 = std::min(grid_height - 1, y2);
for (int yy = y1; yy <= y2; yy++)
{
for (int xx = x1; xx <= x2; xx++)
{
vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
{
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance * minDistance)
{
good = false;
goto break_out;
}
}
}
}
}
break_out:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
tmp2.push_back(p);
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
}
void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
{
CV_DbgAssert(points.type() == CV_32FC2);
points_v.resize(points.cols);
openCLSafeCall(clEnqueueReadBuffer(
*reinterpret_cast<cl_command_queue*>(getoclCommandQueue()),
reinterpret_cast<cl_mem>(points.data),
CL_TRUE,
0,
points.cols * sizeof(Point2f),
&points_v[0],
0,
NULL,
NULL));
}

View File

@@ -1207,30 +1207,41 @@ namespace cv
void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
double k, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(CV_GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
oclMat Dx, Dy;
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), Dx, Dy, dst, borderType);
oclMat dx, dy;
cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
}
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
double k, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(CV_GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
oclMat Dx, Dy;
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
extractCovData(src, dx, dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType);
corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
}
void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
{
oclMat dx, dy;
cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
}
void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
{
if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
{
CV_Error(CV_GpuNotSupported, "select device don't support double");
}
CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
extractCovData(src, dx, dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
}
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)

View File

@@ -156,7 +156,7 @@ namespace cv
format.image_channel_order = CL_RGBA;
break;
default:
CV_Error(-1, "Image forma is not supported");
CV_Error(-1, "Image format is not supported");
break;
}
#ifdef CL_VERSION_1_2
@@ -225,6 +225,11 @@ namespace cv
openCLSafeCall(err);
return texture;
}
Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
{
return Ptr<TextureCL>(new TextureCL(bindTexture(mat), mat.rows, mat.cols, mat.type()));
}
void releaseTexture(cl_mem& texture)
{
openCLFree(texture);

View File

@@ -0,0 +1,276 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other oclMaterials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef WITH_MASK
#define WITH_MASK 0
#endif
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
{
return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
}
inline float ELEM_FLT2(image2d_t _eig, float2 pt)
{
return read_imagef(_eig, sampler, pt).x;
}
__kernel
void findCorners
(
image2d_t eig,
__global const char * mask,
__global float2 * corners,
const int mask_strip,// in pixels
const float threshold,
const int rows,
const int cols,
const int max_count,
__global int * g_counter
)
{
const int j = get_global_id(0);
const int i = get_global_id(1);
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1
#if WITH_MASK
&& mask[i * mask_strip + j] != 0
#endif
)
{
const float val = ELEM_INT2(eig, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = atomic_inc(g_counter);
if (ind < max_count)
corners[ind] = (float2)(j, i);
}
}
}
}
//bitonic sort
__kernel
void sortCorners_bitonicSort
(
image2d_t eig,
__global float2 * corners,
const int count,
const int stage,
const int passOfStage
)
{
const int threadId = get_global_id(0);
if(threadId >= count / 2)
{
return;
}
const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
const int pairDistance = 1 << (stage - passOfStage);
const int blockWidth = 2 * pairDistance;
const int leftId = min( (threadId % pairDistance)
+ (threadId / pairDistance) * blockWidth, count );
const int rightId = min( leftId + pairDistance, count );
const float2 leftPt = corners[leftId];
const float2 rightPt = corners[rightId];
const float leftVal = ELEM_FLT2(eig, leftPt);
const float rightVal = ELEM_FLT2(eig, rightPt);
const bool compareResult = leftVal > rightVal;
float2 greater = compareResult ? leftPt:rightPt;
float2 lesser = compareResult ? rightPt:leftPt;
corners[leftId] = sortOrder ? lesser : greater;
corners[rightId] = sortOrder ? greater : lesser;
}
//selection sort for gfft
//kernel is ported from Bolt library:
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
// Local sort will firstly sort elements of each workgroup using selection sort
// its performance is O(n)
__kernel
void sortCorners_selectionSortLocal
(
image2d_t eig,
__global float2 * corners,
const int count,
__local float2 * scratch
)
{
int i = get_local_id(0); // index in workgroup
int numOfGroups = get_num_groups(0); // index in workgroup
int groupID = get_group_id(0);
int wg = get_local_size(0); // workgroup size = block size
int n; // number of elements to be processed for this work group
int offset = groupID * wg;
int same = 0;
corners += offset;
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
float2 pt1, pt2;
pt1 = corners[min(i, n)];
scratch[i] = pt1;
barrier(CLK_LOCAL_MEM_FENCE);
if(i >= n)
{
return;
}
float val1 = ELEM_FLT2(eig, pt1);
float val2;
int pos = 0;
for (int j=0;j<n;++j)
{
pt2 = scratch[j];
val2 = ELEM_FLT2(eig, pt2);
if(val2 > val1)
pos++;//calculate the rank of this element in this work group
else
{
if(val1 > val2)
continue;
else
{
// val1 and val2 are same
same++;
}
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}
__kernel
void sortCorners_selectionSortFinal
(
image2d_t eig,
__global float2 * corners,
const int count
)
{
const int i = get_local_id(0); // index in workgroup
const int numOfGroups = get_num_groups(0); // index in workgroup
const int groupID = get_group_id(0);
const int wg = get_local_size(0); // workgroup size = block size
int pos = 0, same = 0;
const int offset = get_group_id(0) * wg;
const int remainder = count - wg*(numOfGroups-1);
if((offset + i ) >= count)
return;
float2 pt1, pt2;
pt1 = corners[groupID*wg + i];
float val1 = ELEM_FLT2(eig, pt1);
float val2;
for(int j=0; j<numOfGroups-1; j++ )
{
for(int k=0; k<wg; k++)
{
pt2 = corners[j*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Increment only if the value is not the same.
if( val2 > val1 )
pos++;
else
same++;
}
}
}
for(int k=0; k<remainder; k++)
{
pt2 = corners[(numOfGroups-1)*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Don't increment if the value is the same.
//Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false
if(val2 > val1)
pos++;
else
same++;
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}