HOST side optimization for GFFT

2013-12-17 14:02:57 +04:00
parent 5d5527d03e
commit a63576e76d
3 changed files with 241 additions and 323 deletions
--- a/modules/ocl/include/opencv2/ocl/ocl.hpp
+++ b/modules/ocl/include/opencv2/ocl/ocl.hpp
@@ -1381,8 +1381,10 @@ namespace cv
            oclMat Dx_;
            oclMat Dy_;
            oclMat eig_;
+            oclMat eig_minmax_;
            oclMat minMaxbuf_;
            oclMat tmpCorners_;
+            oclMat counter_;
        };

        inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,
--- a/modules/ocl/src/gftt.cpp
+++ b/modules/ocl/src/gftt.cpp
@@ -48,63 +48,35 @@
 using namespace cv;
 using namespace cv::ocl;

+// currently sort procedure on the host is more efficient
 static bool use_cpu_sorter = true;

-namespace
+// compact structure for corners
+struct DefCorner
 {
-enum SortMethod
-{
-    CPU_STL,
-    BITONIC,
-    SELECTION
+    float eig;  //eigenvalue of corner
+    short x;    //x coordinate of corner point
+    short y;    //y coordinate of corner point
 } ;

-const int GROUP_SIZE = 256;
-
-template<SortMethod method>
-struct Sorter
+// compare procedure for corner
+//it is used for sort on the host side
+struct DefCornerCompare
 {
-    //typedef EigType;
-};
-
-//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
-template<>
-struct Sorter<CPU_STL>
+    bool operator()(const DefCorner a, const DefCorner b) const
    {
-    typedef oclMat EigType;
-    static cv::Mutex cs;
-    static Mat mat_eig;
-
-    //prototype
-    static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
-    {
-        float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
-        float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
-        return v1 > v2;
-    }
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
-    {
-        cv::AutoLock lock(cs);
-        //temporarily use STL's sort function
-        Mat mat_corners = corners;
-        mat_eig = eig_tex;
-        std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
-        corners = mat_corners;
+        return a.eig > b.eig;
    }
 };
-cv::Mutex Sorter<CPU_STL>::cs;
-cv::Mat   Sorter<CPU_STL>::mat_eig;

-template<>
-struct Sorter<BITONIC>
-{
-    typedef TextureCL EigType;
-
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+// sort corner point using opencl bitonicosrt implementation
+static void sortCorners_caller(oclMat& corners, const int count)
 {
    Context * cxt = Context::getContext();
-        size_t globalThreads[3] = {count / 2, 1, 1};
-        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+    int     GS = count/2;
+    int     LS = min(255,GS);
+    size_t  globalThreads[3] = {GS, 1, 1};
+    size_t  localThreads[3]  = {LS, 1, 1};

    // 2^numStages should be equal to count or the output is invalid
    int numStages = 0;
@@ -112,90 +84,106 @@ struct Sorter<BITONIC>
    {
        ++numStages;
    }
-        const int argc = 5;
+    const int argc = 4;
    std::vector< std::pair<size_t, const void *> > args(argc);
    std::string kernelname = "sortCorners_bitonicSort";
-        args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
-        args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
-        args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
+    args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
+    args[1] = std::make_pair(sizeof(cl_int), (void *)&count);
    for(int stage = 0; stage < numStages; ++stage)
    {
-            args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
+        args[2] = std::make_pair(sizeof(cl_int), (void *)&stage);
        for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
        {
-                args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+            args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
            openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
        }
    }
 }
-};

-template<>
-struct Sorter<SELECTION>
-{
-    typedef TextureCL EigType;
-
-    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
-    {
-        Context * cxt = Context::getContext();
-
-        size_t globalThreads[3] = {count, 1, 1};
-        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
-
-        std::vector< std::pair<size_t, const void *> > args;
-        //local
-        std::string kernelname = "sortCorners_selectionSortLocal";
-        int lds_size = GROUP_SIZE * sizeof(cl_float2);
-        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
-        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
-        args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
-        args.push_back( std::make_pair( lds_size,       (void*)NULL) );
-
-        openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
-
-        //final
-        kernelname = "sortCorners_selectionSortFinal";
-        args.pop_back();
-        openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
-    }
-};
-
-int findCorners_caller(
-    const TextureCL& eig,
-    const float threshold,
+// find corners on matrix and put it into array
+void findCorners_caller(
+    const oclMat&   eig_mat,        //input matrix worth eigenvalues
+    oclMat&         eigMinMax,      //input with min and max values of eigenvalues
+    const float     qualityLevel,
    const oclMat&   mask,
-    oclMat& corners,
-    const int max_count)
+    oclMat&         corners,        //output array with detected corners
+    oclMat&         counter)        //output value with number of detected corners, have to be 0 before call
 {
+    string  opt;
    std::vector<int> k;
    Context * cxt = Context::getContext();

    std::vector< std::pair<size_t, const void*> > args;
-    std::string kernelname = "findCorners";

    const int mask_strip = mask.step / mask.elemSize1();

-    oclMat g_counter(1, 1, CV_32SC1);
-    g_counter.setTo(0);
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&(eig_mat.data)));

-    args.push_back(make_pair( sizeof(cl_mem),   (void*)&eig  ));
+    int src_pitch = (int)eig_mat.step;
+    args.push_back(make_pair( sizeof(cl_int),   (void*)&src_pitch ));
    args.push_back(make_pair( sizeof(cl_mem),   (void*)&mask.data ));
    args.push_back(make_pair( sizeof(cl_mem),   (void*)&corners.data ));
    args.push_back(make_pair( sizeof(cl_int),   (void*)&mask_strip));
-    args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
-    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
-    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
-    args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
-    args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&eigMinMax.data ));
+    args.push_back(make_pair( sizeof(cl_float), (void*)&qualityLevel ));
+    args.push_back(make_pair( sizeof(cl_int),   (void*)&eig_mat.rows ));
+    args.push_back(make_pair( sizeof(cl_int),   (void*)&eig_mat.cols ));
+    args.push_back(make_pair( sizeof(cl_int),   (void*)&corners.cols ));
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&counter.data ));

-    size_t globalThreads[3] = {eig.cols, eig.rows, 1};
+    size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1};
    size_t localThreads[3]  = {16, 16, 1};
+    if(!mask.empty())
+        opt += " -D WITH_MASK=1";

-    const char * opt = mask.empty() ? "" : "-D WITH_MASK";
-    openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt);
-    return std::min(Mat(g_counter).at<int>(0), max_count);
+     openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str());
+}
+
+
+static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
+{
+    size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
+    CV_Assert(groupnum != 0);
+
+    int dbsize = groupnum * 2 * src.elemSize();
+
+    ensureSizeIsEnough(1, dbsize, CV_8UC1, dst);
+
+    cl_mem dst_data = reinterpret_cast<cl_mem>(dst.data);
+
+    int all_cols = src.step / src.elemSize();
+    int pre_cols = (src.offset % src.step) / src.elemSize();
+    int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
+    int invalid_cols = pre_cols + sec_cols;
+    int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
+    int offset = src.offset / src.elemSize();
+
+    {// first parallel pass
+        vector<pair<size_t , const void *> > args;
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
+        args.push_back( make_pair( sizeof(cl_int) , (void *)&cols ));
+        args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
+        args.push_back( make_pair( sizeof(cl_int) , (void *)&offset));
+        args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
+        args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
+        size_t globalThreads[3] = {groupnum * 256, 1, 1};
+        size_t localThreads[3] = {256, 1, 1};
+        openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
+                            args, -1, -1, "-D T=float -D DEPTH_5");
+    }
+
+    {// run final "serial" kernel to find accumulate results from threads and reset corner counter
+        vector<pair<size_t , const void *> > args;
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data ));
+        args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum ));
+        args.push_back( make_pair( sizeof(cl_mem) , (void *)&tozero.data ));
+        size_t globalThreads[3] = {1, 1, 1};
+        size_t localThreads[3] = {1, 1, 1};
+        openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads,
+                            args, -1, -1);
+    }
 }
-}//unnamed namespace

 void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
 {
@@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
    ensureSizeIsEnough(image.size(), CV_32F, eig_);

    if (useHarrisDetector)
-        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
+        cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
    else
        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);

-    double maxVal = 0;
-    minMax(eig_, NULL, &maxVal);
+    ensureSizeIsEnough(1,1, CV_32SC1, counter_);

-    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+    // find max eigenvalue and reset detected counters
+    minMaxEig_caller(eig_,eig_minmax_,counter_);

-    Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
-    int total = findCorners_caller(
-        *eig_tex,
-        static_cast<float>(maxVal * qualityLevel),
+    // allocate buffer for kernels
+    int corner_array_size = std::max(1024, static_cast<int>(image.size().area() * 0.05));
+
+    if(!use_cpu_sorter)
+    {   // round to 2^n
+        unsigned int n=1;
+        for(n=1;n<(unsigned int)corner_array_size;n<<=1);
+        corner_array_size = (int)n;
+
+        ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
+
+        // set to 0 to be able use bitonic sort on whole 2^n array
+        tmpCorners_.setTo(0);
+    }
+    else
+    {
+        ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
+    }
+
+    int total = tmpCorners_.cols; // by default the number of corner is full array
+    vector<DefCorner>   tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm
+
+    //find points with high eigenvalue and put it into the output array
+    findCorners_caller(
+        eig_,
+        eig_minmax_,
+        static_cast<float>(qualityLevel),
        mask,
        tmpCorners_,
-        tmpCorners_.cols);
+        counter_);
+
+    if(!use_cpu_sorter)
+    {// sort detected corners on deivce side
+        sortCorners_caller(tmpCorners_, corner_array_size);
+    }
+    else
+    {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side
+        openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL));
+    }
+
+    //blocking read whole corners array (sorted or not sorted)
+    openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner));

    if (total == 0)
-    {
+    {// check for trivial case
        corners.release();
        return;
    }
+
    if(use_cpu_sorter)
-    {
-        Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
-    }
-    else
-    {
-        //if total is power of 2
-        if(((total - 1) & (total)) == 0)
-        {
-            Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
-        }
-        else
-        {
-            Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
-        }
+    {// sort detected corners on cpu side.
+        tmp.resize(total);
+        cv::sort(tmp,DefCornerCompare());
    }

+    //estimate maximal size of final output array
+    int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total;
+    int D2 = (int)ceil(minDistance * minDistance);
+    // allocate output buffer
+    vector<Point2f> tmp2;
+    tmp2.reserve(total_max);
+
+
    if (minDistance < 1)
+    {// we have not distance restriction. then just copy with conversion maximal allowed points into output array
+        for(int i=0;i<total_max && tmp[i].eig>0.0f;++i)
        {
-        Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
-        tmpCorners_(roi_range).copyTo(corners);
+            tmp2.push_back(Point2f(tmp[i].x,tmp[i].y));
+        }
    }
    else
-    {
-        vector<Point2f> tmp(total);
-        downloadPoints(tmpCorners_, tmp);
-
-        vector<Point2f> tmp2;
-        tmp2.reserve(total);
-
+    {// we have distance restriction. then start coping to output array from the first element and check distance for each next one
        const int cell_size = cvRound(minDistance);
        const int grid_width = (image.cols + cell_size - 1) / cell_size;
        const int grid_height = (image.rows + cell_size - 1) / cell_size;

-        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+        std::vector< std::vector<Point2i> > grid(grid_width * grid_height);

        for (int i = 0; i < total ; ++i)
        {
-            Point2f p = tmp[i];
+            DefCorner p = tmp[i];
+
+            if(p.eig<=0.0f)
+                break; // condition to stop that is needed for GPU bitonic sort usage.

            bool good = true;

@@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
            {
                for (int xx = x1; xx <= x2; xx++)
                {
-                    vector<Point2f>& m = grid[yy * grid_width + xx];
-
-                    if (!m.empty())
-                    {
+                    vector<Point2i>& m = grid[yy * grid_width + xx];
+                    if (m.empty())
+                        continue;
                    for(size_t j = 0; j < m.size(); j++)
                    {
-                            float dx = p.x - m[j].x;
-                            float dy = p.y - m[j].y;
+                        int dx = p.x - m[j].x;
+                        int dy = p.y - m[j].y;

-                            if (dx * dx + dy * dy < minDistance * minDistance)
+                        if (dx * dx + dy * dy < D2)
                        {
                            good = false;
-                                goto break_out;
-                            }
+                            goto break_out_;
                        }
                    }
                }
            }

-            break_out:
+            break_out_:

            if(good)
            {
-                grid[y_cell * grid_width + x_cell].push_back(p);
+                grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y));

-                tmp2.push_back(p);
+                tmp2.push_back(Point2f(p.x,p.y));

                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
                    break;
            }
        }

-        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
    }
+    int final_size = static_cast<int>(tmp2.size());
+    if(final_size>0)
+        corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0]));
+    else
+        corners.release();
 }
 void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
 {
--- a/modules/ocl/src/opencl/imgproc_gftt.cl
+++ b/modules/ocl/src/opencl/imgproc_gftt.cl
@@ -46,33 +46,26 @@
 #ifndef WITH_MASK
 #define WITH_MASK 0
 #endif
-
-__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
-
-inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
-{
-    return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
-}
-
-inline float ELEM_FLT2(image2d_t _eig, float2 pt)
-{
-    return read_imagef(_eig, sampler, pt).x;
-}
+//macro to read eigenvalue matrix
+#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x]

 __kernel
    void findCorners
    (
-        image2d_t eig,
+        __global const char*    eig,
+        const int               eig_pitch,
        __global const char*    mask,
        __global float2*        corners,
        const int               mask_strip,// in pixels
-        const float threshold,
+        __global const float*   pMinMax,
+        const float             qualityLevel,
        const int               rows,
        const int               cols,
        const int               max_count,
        __global int*           g_counter
    )
 {
+    float threshold = qualityLevel*pMinMax[1];
    const int j = get_global_id(0);
    const int i = get_global_id(1);

@@ -82,39 +75,42 @@ __kernel
 #endif
        )
    {
-        const float val = ELEM_INT2(eig, j, i);
+        const float val = GET_SRC_32F(j, i);

        if (val > threshold)
        {
            float maxVal = val;
+            maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j    , i - 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal);

-            maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j    , i - 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal);
+            maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal);

-            maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
-
-            maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j    , i + 1), maxVal);
-            maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j    , i + 1), maxVal);
+            maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal);

            if (val == maxVal)
            {
                const int ind = atomic_inc(g_counter);

                if (ind < max_count)
-                    corners[ind] = (float2)(j, i);
+                {// pack and store eigenvalue and its coordinates
+                    corners[ind].x = val;
+                    corners[ind].y = as_float(j|(i<<16));
                }
            }
        }
    }
+}
+#undef GET_SRC_32F
+

 //bitonic sort
 __kernel
    void sortCorners_bitonicSort
    (
-        image2d_t eig,
        __global float2 * corners,
        const int count,
        const int stage,
@@ -140,8 +136,8 @@ __kernel
    const float2 leftPt  = corners[leftId];
    const float2 rightPt = corners[rightId];

-    const float leftVal  = ELEM_FLT2(eig, leftPt);
-    const float rightVal = ELEM_FLT2(eig, rightPt);
+    const float leftVal  = leftPt.x;
+    const float rightVal = rightPt.x;

    const bool compareResult = leftVal > rightVal;

@@ -152,124 +148,22 @@ __kernel
    corners[rightId] = sortOrder ? greater : lesser;
 }

-//selection sort for gfft
-//kernel is ported from Bolt library:
-//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
-//  Local sort will firstly sort elements of each workgroup using selection sort
-//  its performance is O(n)
-__kernel
-    void sortCorners_selectionSortLocal
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count,
-        __local float2 * scratch
-    )
+// this is simple short serial kernel that makes some short reduction and initialization work
+// it makes HOST like work to avoid additional sync with HOST to do this short work
+// data - input/output float2.
+//      input data are sevral (min,max) pairs
+//      output data is one reduced (min,max) pair
+// g_counter - counter that have to be initialized by 0 for next findCorner call.
+__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter)
 {
-    int          i  = get_local_id(0); // index in workgroup
-    int numOfGroups = get_num_groups(0); // index in workgroup
-    int groupID     = get_group_id(0);
-    int         wg  = get_local_size(0); // workgroup size = block size
-    int n; // number of elements to be processed for this work group
-
-    int offset   = groupID * wg;
-    int same     = 0;
-    corners      += offset;
-    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
-    float2 pt1, pt2;
-
-    pt1 = corners[min(i, n)];
-    scratch[i] = pt1;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    if(i >= n)
+    g_counter[0] = 0;
+    float minVal = data[0];
+    float maxVal = data[groupnum];
+    for(int i=1;i<groupnum;++i)
    {
-        return;
+        minVal = min(minVal,data[i]);
+        maxVal = max(maxVal,data[i+groupnum]);
    }
-
-    float val1 = ELEM_FLT2(eig, pt1);
-    float val2;
-
-    int pos = 0;
-    for (int j=0;j<n;++j)
-    {
-        pt2  = scratch[j];
-        val2 = ELEM_FLT2(eig, pt2);
-        if(val2 > val1)
-            pos++;//calculate the rank of this element in this work group
-        else
-        {
-            if(val1 > val2)
-                continue;
-            else
-            {
-                // val1 and val2 are same
-                same++;
-            }
-        }
-    }
-    for (int j=0; j< same; j++)
-        corners[pos + j] = pt1;
-}
-__kernel
-    void sortCorners_selectionSortFinal
-    (
-        image2d_t eig,
-        __global float2 * corners,
-        const int count
-    )
-{
-    const int          i  = get_local_id(0); // index in workgroup
-    const int numOfGroups = get_num_groups(0); // index in workgroup
-    const int groupID     = get_group_id(0);
-    const int         wg  = get_local_size(0); // workgroup size = block size
-    int pos = 0, same = 0;
-    const int offset = get_group_id(0) * wg;
-    const int remainder = count - wg*(numOfGroups-1);
-
-    if((offset + i ) >= count)
-        return;
-    float2 pt1, pt2;
-    pt1 = corners[groupID*wg + i];
-
-    float val1 = ELEM_FLT2(eig, pt1);
-    float val2;
-
-    for(int j=0; j<numOfGroups-1; j++ )
-    {
-        for(int k=0; k<wg; k++)
-        {
-            pt2  = corners[j*wg + k];
-            val2 = ELEM_FLT2(eig, pt2);
-            if(val1 > val2)
-                break;
-            else
-            {
-                //Increment only if the value is not the same.
-                if( val2 > val1 )
-                    pos++;
-                else
-                    same++;
-            }
-        }
-    }
-
-    for(int k=0; k<remainder; k++)
-    {
-        pt2  = corners[(numOfGroups-1)*wg + k];
-        val2 = ELEM_FLT2(eig, pt2);
-        if(val1 > val2)
-            break;
-        else
-        {
-            //Don't increment if the value is the same.
-            //Two elements are same if (*userComp)(jData, iData)  and (*userComp)(iData, jData) are both false
-            if(val2 > val1)
-                pos++;
-            else
-                same++;
-        }
-    }
-    for (int j=0; j< same; j++)
-        corners[pos + j] = pt1;
+    data[0] = minVal;
+    data[1] = maxVal;
 }