Add ocl's good features to track implementation.

Additional notes with this commit: 1. Add cornerHarris_dxdy and cornerMinEigenVal_dxdy to get the interim dx and dy output of Sobel operator; 2. Add minMax_buf to allow user to reuse buffers in minMax; 3. Fix an error when either min or max pointer fed into minMax is NULL; 4. Corner sorter temporarily uses C++ STL's quick sort. A parallel selection sort in OpneCL is contained in the implementation but disabled due to poor performance at the moment. 5. Accuracy test for ocl gfft.
2013-05-22 13:46:42 +08:00
parent d4255b7f75
commit b4a4a05bdc
8 changed files with 841 additions and 32 deletions
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -782,45 +782,55 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
    }
 }

-template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
+template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal, double *maxVal,
+                                             const oclMat &mask, oclMat &buf)
 {
    size_t groupnum = src.clCxt->computeUnits();
    CV_Assert(groupnum != 0);
    groupnum = groupnum * 2;
    int vlen = 8;
    int dbsize = groupnum * 2 * vlen * sizeof(T) ;
-    Context *clCxt = src.clCxt;
-    cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
-    *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
+
+    ensureSizeIsEnough(1, dbsize, CV_8UC1, buf);
+
+    cl_mem buf_data = reinterpret_cast<cl_mem>(buf.data);
+
    if (mask.empty())
    {
-        arithmetic_minMax_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax");
+        arithmetic_minMax_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax");
    }
    else
    {
-        arithmetic_minMax_mask_run(src, mask, dstBuffer, vlen, groupnum, "arithm_op_minMax_mask");
+        arithmetic_minMax_mask_run(src, mask, buf_data, vlen, groupnum, "arithm_op_minMax_mask");
    }
-    T *p = new T[groupnum * vlen * 2];
-    memset(p, 0, dbsize);
-    openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    if(minVal != NULL){
+
+    Mat matbuf = Mat(buf);
+    T *p = matbuf.ptr<T>();
+    if(minVal != NULL)
+    {
+        *minVal = std::numeric_limits<double>::max();
        for(int i = 0; i < vlen * (int)groupnum; i++)
        {
            *minVal = *minVal < p[i] ? *minVal : p[i];
        }
    }
-    if(maxVal != NULL){
+    if(maxVal != NULL)
+    {
+        *maxVal = -std::numeric_limits<double>::max();
        for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
        {
            *maxVal = *maxVal > p[i] ? *maxVal : p[i];
        }
    }
-    delete[] p;
-    openCLFree(dstBuffer);
 }

-typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask);
+typedef void (*minMaxFunc)(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf);
 void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask)
+{
+    oclMat buf;
+    minMax_buf(src, minVal, maxVal, mask, buf);
+}
+void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
 {
    CV_Assert(src.oclchannels() == 1);
    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
@@ -840,7 +850,7 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
    };
    minMaxFunc func;
    func = functab[src.depth()];
-    func(src, minVal, maxVal, mask);
+    func(src, minVal, maxVal, mask, buf);
 }

 //////////////////////////////////////////////////////////////////////////////
--- a/modules/ocl/src/gfft.cpp
+++ b/modules/ocl/src/gfft.cpp
@@ -0,0 +1,351 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+#include <iomanip>
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::ocl;
+
+static bool use_cpu_sorter = true;
+
+namespace cv
+{
+    namespace ocl
+    {
+        ///////////////////////////OpenCL kernel strings///////////////////////////
+        extern const char *imgproc_gfft;
+    }
+}
+
+namespace
+{
+enum SortMethod
+{
+    CPU_STL,
+    BITONIC,
+    SELECTION
+};
+
+const int GROUP_SIZE = 256;
+
+template<SortMethod method>
+struct Sorter
+{
+    //typedef EigType;
+};
+
+//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
+template<>
+struct Sorter<CPU_STL>
+{
+    typedef oclMat EigType;
+    static cv::Mutex cs;
+    static Mat mat_eig;
+
+    //prototype
+    static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
+    {
+        float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
+        float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
+        return v1 > v2;
+    }
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        cv::AutoLock lock(cs);
+        //temporarily use STL's sort function
+        Mat mat_corners = corners;
+        mat_eig = eig_tex;
+        std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
+        corners = mat_corners;
+    }
+};
+cv::Mutex Sorter<CPU_STL>::cs;
+cv::Mat   Sorter<CPU_STL>::mat_eig;
+
+template<>
+struct Sorter<BITONIC>
+{
+    typedef TextureCL EigType;
+
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        Context * cxt = Context::getContext();
+        size_t globalThreads[3] = {count / 2, 1, 1};
+        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+        // 2^numStages should be equal to count or the output is invalid
+        int numStages = 0;
+        for(int i = count; i > 1; i >>= 1)
+        {
+            ++numStages;
+        }
+        const int argc = 5;
+        std::vector< std::pair<size_t, const void *> > args(argc);
+        std::string kernelname = "sortCorners_bitonicSort";
+        args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
+        args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
+        args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
+        for(int stage = 0; stage < numStages; ++stage)
+        {
+            args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
+            for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
+            {
+                args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+                openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+            }
+        }
+    }
+};
+
+template<>
+struct Sorter<SELECTION>
+{
+    typedef TextureCL EigType;
+
+    static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
+    {
+        Context * cxt = Context::getContext();
+        
+        size_t globalThreads[3] = {count, 1, 1};
+        size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+        std::vector< std::pair<size_t, const void *> > args;
+        //local
+        std::string kernelname = "sortCorners_selectionSortLocal";
+        int lds_size = GROUP_SIZE * sizeof(cl_float2);
+        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
+        args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
+        args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
+        args.push_back( std::make_pair( lds_size,       (void*)NULL) );
+
+        openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+
+        //final
+        kernelname = "sortCorners_selectionSortFinal";
+        args.pop_back();
+        openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1);
+    }
+};
+
+int findCorners_caller(
+    const TextureCL& eig, 
+    const float threshold,
+    const oclMat& mask,
+    oclMat& corners,
+    const int max_count)
+{
+    std::vector<int> k;
+    Context * cxt = Context::getContext();
+
+    std::vector< std::pair<size_t, const void*> > args;
+    std::string kernelname = "findCorners";
+
+    const int mask_strip = mask.step / mask.elemSize1();
+
+    oclMat g_counter(1, 1, CV_32SC1);
+    g_counter.setTo(0);
+
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&eig  ));
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&mask.data ));
+    args.push_back(make_pair( sizeof(cl_mem),   (void*)&corners.data ));
+    args.push_back(make_pair( sizeof(cl_int),   (void*)&mask_strip));
+    args.push_back(make_pair( sizeof(cl_float), (void*)&threshold ));
+    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows ));
+    args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols ));
+    args.push_back(make_pair( sizeof(cl_int), (void*)&max_count ));
+    args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
+
+    size_t globalThreads[3] = {eig.cols, eig.rows, 1};
+    size_t localThreads[3]  = {16, 16, 1};
+
+    const char * opt = mask.empty() ? "" : "-D WITH_MASK";
+    openCLExecuteKernel(cxt, &imgproc_gfft, kernelname, globalThreads, localThreads, args, -1, -1, opt);
+    return std::min(Mat(g_counter).at<int>(0), max_count);
+}
+}//unnamed namespace
+
+void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
+{
+    CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
+    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
+
+    CV_DbgAssert(support_image2d());
+
+    ensureSizeIsEnough(image.size(), CV_32F, eig_);
+
+    if (useHarrisDetector)
+        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
+    else
+        cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
+
+    double maxVal = 0;
+    minMax_buf(eig_, 0, &maxVal, oclMat(), minMaxbuf_);
+
+    ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
+
+    Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
+    int total = findCorners_caller(
+        *eig_tex,
+        static_cast<float>(maxVal * qualityLevel),
+        mask,
+        tmpCorners_,
+        tmpCorners_.cols);
+
+    if (total == 0)
+    {
+        corners.release();
+        return;
+    }
+    if(use_cpu_sorter)
+    {
+        Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
+    }
+    else
+    {
+        //if total is power of 2
+        if(((total - 1) & (total)) == 0)
+        {
+            Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+        }
+        else
+        {
+            Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
+        }
+    }
+    
+    if (minDistance < 1)
+    {
+        corners = tmpCorners_(Rect(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1));
+    }
+    else
+    {
+        vector<Point2f> tmp(total);
+        downloadPoints(tmpCorners_, tmp);
+
+        vector<Point2f> tmp2;
+        tmp2.reserve(total);
+
+        const int cell_size = cvRound(minDistance);
+        const int grid_width = (image.cols + cell_size - 1) / cell_size;
+        const int grid_height = (image.rows + cell_size - 1) / cell_size;
+
+        std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
+
+        for (int i = 0; i < total; ++i)
+        {
+            Point2f p = tmp[i];
+
+            bool good = true;
+
+            int x_cell = static_cast<int>(p.x / cell_size);
+            int y_cell = static_cast<int>(p.y / cell_size);
+
+            int x1 = x_cell - 1;
+            int y1 = y_cell - 1;
+            int x2 = x_cell + 1;
+            int y2 = y_cell + 1;
+
+            // boundary check
+            x1 = std::max(0, x1);
+            y1 = std::max(0, y1);
+            x2 = std::min(grid_width - 1, x2);
+            y2 = std::min(grid_height - 1, y2);
+
+            for (int yy = y1; yy <= y2; yy++)
+            {
+                for (int xx = x1; xx <= x2; xx++)
+                {
+                    vector<Point2f>& m = grid[yy * grid_width + xx];
+
+                    if (!m.empty())
+                    {
+                        for(size_t j = 0; j < m.size(); j++)
+                        {
+                            float dx = p.x - m[j].x;
+                            float dy = p.y - m[j].y;
+
+                            if (dx * dx + dy * dy < minDistance * minDistance)
+                            {
+                                good = false;
+                                goto break_out;
+                            }
+                        }
+                    }
+                }
+            }
+
+            break_out:
+
+            if(good)
+            {
+                grid[y_cell * grid_width + x_cell].push_back(p);
+
+                tmp2.push_back(p);
+
+                if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
+                    break;
+            }
+        }
+
+        corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
+    }
+}
+void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector<Point2f> &points_v)
+{
+    CV_DbgAssert(points.type() == CV_32FC2);
+    points_v.resize(points.cols);
+    openCLSafeCall(clEnqueueReadBuffer(
+        *reinterpret_cast<cl_command_queue*>(getoclCommandQueue()), 
+        reinterpret_cast<cl_mem>(points.data), 
+        CL_TRUE,                                    
+        0, 
+        points.cols * sizeof(Point2f), 
+        &points_v[0], 
+        0, 
+        NULL, 
+        NULL));
+}
+
+
--- a/modules/ocl/src/imgproc.cpp
+++ b/modules/ocl/src/imgproc.cpp
@@ -1207,30 +1207,41 @@ namespace cv
        void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize,
                          double k, int borderType)
        {
-            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
-            {
-                CV_Error(CV_GpuNotSupported, "select device don't support double");
-            }
-            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-            oclMat Dx, Dy;
-            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-            extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
-            dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), Dx, Dy, dst, borderType);
+            oclMat dx, dy;
+            cornerHarris_dxdy(src, dst, dx, dy, blockSize, ksize, k, borderType);
        }

-        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
+        void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize,
+                          double k, int borderType)
        {
            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
            {
                CV_Error(CV_GpuNotSupported, "select device don't support double");
            }
            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
-            oclMat Dx, Dy;
            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
-            extractCovData(src, Dx, Dy, blockSize, ksize, borderType);
+            extractCovData(src, dx, dy, blockSize, ksize, borderType);
            dst.create(src.size(), CV_32F);
-            corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, Dx, Dy, dst, borderType);
+            corner_ocl(imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
+        }
+
+        void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int borderType)
+        {
+            oclMat dx, dy;
+            cornerMinEigenVal_dxdy(src, dst, dx, dy, blockSize, ksize, borderType);
+        }
+        
+        void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &dx, oclMat &dy, int blockSize, int ksize, int borderType)
+        {
+            if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+            {
+                CV_Error(CV_GpuNotSupported, "select device don't support double");
+            }
+            CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
+            CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
+            extractCovData(src, dx, dy, blockSize, ksize, borderType);
+            dst.create(src.size(), CV_32F);
+            corner_ocl(imgproc_calcMinEigenVal, "calcMinEigenVal", blockSize, 0, dx, dy, dst, borderType);
        }
        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
        static void meanShiftFiltering_gpu(const oclMat &src, oclMat dst, int sp, int sr, int maxIter, float eps)
--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
@@ -156,7 +156,7 @@ namespace cv
                format.image_channel_order     = CL_RGBA;
                break;
            default:
-                CV_Error(-1, "Image forma is not supported");
+                CV_Error(-1, "Image format is not supported");
                break;
            }
 #ifdef CL_VERSION_1_2
@@ -225,6 +225,11 @@ namespace cv
            openCLSafeCall(err);
            return texture;
        }
+        Ptr<TextureCL> bindTexturePtr(const oclMat &mat)
+        {
+            return Ptr<TextureCL>(new TextureCL(bindTexture(mat), mat.rows, mat.cols, mat.type()));
+        }
+
        void releaseTexture(cl_mem& texture)
        {
            openCLFree(texture);
--- a/modules/ocl/src/opencl/imgproc_gfft.cl
+++ b/modules/ocl/src/opencl/imgproc_gfft.cl
@@ -0,0 +1,276 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WITH_MASK
+#define WITH_MASK 0
+#endif
+
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+inline float ELEM_INT2(image2d_t _eig, int _x, int _y) 
+{
+    return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
+}
+
+inline float ELEM_FLT2(image2d_t _eig, float2 pt) 
+{
+    return read_imagef(_eig, sampler, pt).x;
+}
+
+__kernel
+    void findCorners
+    (
+        image2d_t eig,
+        __global const char * mask,
+        __global float2 * corners,
+        const int mask_strip,// in pixels
+        const float threshold,
+        const int rows,
+        const int cols,
+        const int max_count,
+        __global int * g_counter
+    )
+{
+    const int j = get_global_id(0);
+    const int i = get_global_id(1);
+
+    if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1
+#if WITH_MASK
+        && mask[i * mask_strip + j] != 0
+#endif
+        )
+    {
+        const float val = ELEM_INT2(eig, j, i);
+
+        if (val > threshold)
+        {
+            float maxVal = val;
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j    , i - 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
+
+            maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j    , i + 1), maxVal);
+            maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
+
+            if (val == maxVal)
+            {
+                const int ind = atomic_inc(g_counter);
+
+                if (ind < max_count)
+                    corners[ind] = (float2)(j, i);
+            }
+        }
+    }
+}
+
+//bitonic sort
+__kernel
+    void sortCorners_bitonicSort
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        const int stage,
+        const int passOfStage
+    )
+{
+    const int threadId = get_global_id(0);
+    if(threadId >= count / 2)
+    {
+        return;
+    }
+
+    const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
+
+    const int pairDistance = 1 << (stage - passOfStage);
+    const int blockWidth   = 2 * pairDistance;
+
+    const int leftId = min( (threadId % pairDistance) 
+                   + (threadId / pairDistance) * blockWidth, count );
+
+    const int rightId = min( leftId + pairDistance, count );
+
+    const float2 leftPt  = corners[leftId];
+    const float2 rightPt = corners[rightId];
+
+    const float leftVal  = ELEM_FLT2(eig, leftPt);
+    const float rightVal = ELEM_FLT2(eig, rightPt);
+
+    const bool compareResult = leftVal > rightVal;
+
+    float2 greater = compareResult ? leftPt:rightPt;
+    float2 lesser  = compareResult ? rightPt:leftPt;
+    
+    corners[leftId]  = sortOrder ? lesser : greater;
+    corners[rightId] = sortOrder ? greater : lesser;
+}
+
+//selection sort for gfft
+//kernel is ported from Bolt library:
+//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+//  Local sort will firstly sort elements of each workgroup using selection sort
+//  its performance is O(n)
+__kernel
+    void sortCorners_selectionSortLocal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        __local float2 * scratch
+    )
+{
+    int          i  = get_local_id(0); // index in workgroup
+    int numOfGroups = get_num_groups(0); // index in workgroup
+    int groupID     = get_group_id(0);
+    int         wg  = get_local_size(0); // workgroup size = block size
+    int n; // number of elements to be processed for this work group
+
+    int offset   = groupID * wg;
+    int same     = 0;
+    corners      += offset;
+    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
+    float2 pt1, pt2;
+
+    pt1 = corners[min(i, n)];
+    scratch[i] = pt1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(i >= n)
+    {
+        return;
+    }
+
+    float val1 = ELEM_FLT2(eig, pt1);
+    float val2;
+
+    int pos = 0;
+    for (int j=0;j<n;++j)
+    {
+        pt2  = scratch[j];
+        val2 = ELEM_FLT2(eig, pt2);
+        if(val2 > val1) 
+            pos++;//calculate the rank of this element in this work group
+        else 
+        {
+            if(val1 > val2)
+                continue;
+            else 
+            {
+                // val1 and val2 are same
+                same++;
+            }
+        }
+    }
+    for (int j=0; j< same; j++)      
+        corners[pos + j] = pt1;
+}
+__kernel
+    void sortCorners_selectionSortFinal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count
+    )
+{
+    const int          i  = get_local_id(0); // index in workgroup
+    const int numOfGroups = get_num_groups(0); // index in workgroup
+    const int groupID     = get_group_id(0);
+    const int         wg  = get_local_size(0); // workgroup size = block size
+    int pos = 0, same = 0;
+    const int offset = get_group_id(0) * wg;
+    const int remainder = count - wg*(numOfGroups-1);
+
+    if((offset + i ) >= count)
+        return;
+    float2 pt1, pt2;
+    pt1 = corners[groupID*wg + i];
+
+    float val1 = ELEM_FLT2(eig, pt1);
+    float val2;
+
+    for(int j=0; j<numOfGroups-1; j++ )
+    {
+        for(int k=0; k<wg; k++)
+        {
+            pt2  = corners[j*wg + k];
+            val2 = ELEM_FLT2(eig, pt2); 
+            if(val1 > val2)
+                break;
+            else
+            {
+                //Increment only if the value is not the same. 
+                if( val2 > val1 )
+                    pos++;
+                else 
+                    same++;
+            }
+        }
+    }
+
+    for(int k=0; k<remainder; k++)
+    {
+        pt2  = corners[(numOfGroups-1)*wg + k];
+        val2 = ELEM_FLT2(eig, pt2); 
+        if(val1 > val2)
+            break;
+        else
+        {
+            //Don't increment if the value is the same. 
+            //Two elements are same if (*userComp)(jData, iData)  and (*userComp)(iData, jData) are both false
+            if(val2 > val1)
+                pos++;
+            else 
+                same++;
+        }
+    }  
+    for (int j=0; j< same; j++)      
+        corners[pos + j] = pt1;
+}
+