Merge pull request #1189 from pengx17:2.4_sort_by_key

2013-08-07 11:41:23 +04:00
parent a7da129984 8b90cb375a
commit 95bdd4b670
7 changed files with 1477 additions and 1 deletions
--- a/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
@@ -0,0 +1,176 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable 
+
+#ifndef N   // number of radices
+#define N 4
+#endif
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+#ifndef IS_GT
+#define IS_GT 0
+#endif
+
+
+// from Thrust::b40c, link:
+// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h
+__inline uint convertKey(uint converted_key)
+{
+#ifdef K_FLT
+    unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000;
+    converted_key ^= mask;
+#elif defined(K_INT)
+    const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1);
+    converted_key ^= SIGN_MASK;	
+#else
+
+#endif
+    return converted_key;
+}
+
+//FIXME(pengx17): 
+// exclusive scan, need to be optimized as this is too naive...
+kernel
+    void naiveScanAddition(
+    __global int * input,
+    __global int * output,
+    int size
+    )
+{
+    if(get_global_id(0) == 0)
+    {
+        output[0] = 0;
+        for(int i = 1; i < size; i ++)
+        {
+            output[i] = output[i - 1] + input[i - 1];
+        }
+    }
+}
+
+// following is ported from
+// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl
+kernel
+    void histogramRadixN (
+    __global K_T* unsortedKeys,
+    __global int * buckets,
+    uint shiftCount
+    )
+{
+    const int RADIX_T     = N;
+    const int RADICES_T   = (1 << RADIX_T);
+    const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T; 
+    const int MASK_T      = (1 << RADIX_T) - 1;
+    int localBuckets[16] = {0,0,0,0,0,0,0,0,
+                            0,0,0,0,0,0,0,0};
+    int globalId    = get_global_id(0);
+    int numOfGroups = get_num_groups(0);
+
+    /* Calculate thread-histograms */
+    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
+    {
+        uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)]));
+        value = (value >> shiftCount) & MASK_T;
+#if IS_GT
+        localBuckets[RADICES_T - value - 1]++;
+#else
+        localBuckets[value]++;
+#endif
+    }
+
+    for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
+    {
+        buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i];
+    }
+}
+
+kernel
+    void permuteRadixN (
+    __global K_T*  unsortedKeys,
+    __global V_T*  unsortedVals,
+    __global int* scanedBuckets,
+    uint shiftCount,
+    __global K_T*  sortedKeys,
+    __global V_T*  sortedVals
+    )
+{
+    const int RADIX_T     = N;
+    const int RADICES_T   = (1 << RADIX_T);
+    const int MASK_T = (1<<RADIX_T)  -1;
+
+    int globalId  = get_global_id(0);
+    int numOfGroups = get_num_groups(0);
+    const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N;
+    int  localIndex[16];
+
+    /*Load the index to local memory*/
+    for(int i = 0; i < RADICES_T; ++i)
+    {
+#if IS_GT
+        localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
+#else
+        localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
+#endif
+    }
+    /* Permute elements to appropriate location */
+    for(int i = 0; i < RADICES_T; ++i)
+    {
+        int old_idx = mad24(globalId, RADICES_T, i);
+        K_T  ovalue = unsortedKeys[old_idx];
+        uint value = convertKey(as_uint(ovalue));
+        uint maskedValue = (value >> shiftCount) & MASK_T;
+        uint index = localIndex[maskedValue];
+        sortedKeys[index] = ovalue;
+        sortedVals[index] = unsortedVals[old_idx];
+        localIndex[maskedValue] = index + 1;
+    }
+}
--- a/modules/ocl/src/opencl/kernel_sort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_sort_by_key.cl
@@ -0,0 +1,245 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+#ifndef IS_GT
+#define IS_GT false
+#endif
+
+#if IS_GT
+#define my_comp(x,y) ((x) > (y))
+#else
+#define my_comp(x,y) ((x) < (y))
+#endif
+
+/////////////////////// Bitonic sort ////////////////////////////
+// ported from 
+// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl
+__kernel
+    void bitonicSort
+    (
+        __global K_T * keys,
+        __global V_T * vals,
+        int count,
+        int stage,
+        int passOfStage
+    )
+{
+    const int threadId = get_global_id(0);
+    if(threadId >= count / 2)
+    {
+        return;
+    }
+    const int pairDistance = 1 << (stage - passOfStage);
+    const int blockWidth   = 2 * pairDistance;
+
+    int leftId = min( (threadId % pairDistance) 
+                   + (threadId / pairDistance) * blockWidth, count );
+
+    int rightId = min( leftId + pairDistance, count );
+
+    int temp;
+
+    const V_T lval = vals[leftId];
+    const V_T rval = vals[rightId]; 
+
+    const K_T lkey = keys[leftId];
+    const K_T rkey = keys[rightId];
+
+    int sameDirectionBlockWidth = 1 << stage;
+
+    if((threadId/sameDirectionBlockWidth) % 2 == 1)
+    {
+        temp = rightId;
+        rightId = leftId;
+        leftId = temp;
+    }
+
+    const bool compareResult = my_comp(lkey, rkey);
+
+    if(compareResult)
+    {
+        keys[rightId] = rkey;
+        keys[leftId]  = lkey;
+        vals[rightId] = rval;
+        vals[leftId]  = lval;
+    }
+    else
+    {
+        keys[rightId] = lkey;
+        keys[leftId]  = rkey;
+        vals[rightId] = lval;
+        vals[leftId]  = rval;
+    }
+}
+
+/////////////////////// Selection sort ////////////////////////////
+//kernel is ported from Bolt library:
+//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+__kernel
+    void selectionSortLocal
+    (
+        __global K_T * keys,
+        __global V_T * vals,
+        const int count,
+        __local  K_T * scratch
+    )
+{
+    int          i  = get_local_id(0); // index in workgroup
+    int numOfGroups = get_num_groups(0); // index in workgroup
+    int groupID     = get_group_id(0);
+    int         wg  = get_local_size(0); // workgroup size = block size
+    int n; // number of elements to be processed for this work group
+
+    int offset   = groupID * wg;
+    int same     = 0;
+    
+    vals      += offset;
+    keys      += offset;
+    n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
+
+    int clamped_i= min(i, n - 1);
+
+    K_T key1 = keys[clamped_i], key2;
+    V_T val1 = vals[clamped_i];
+    scratch[i] = key1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(i >= n)
+    {
+        return;
+    }
+
+    int pos = 0;
+    for (int j=0;j<n;++j)
+    {
+        key2  = scratch[j];
+        if(my_comp(key2, key1)) 
+            pos++;//calculate the rank of this element in this work group
+        else 
+        {
+            if(my_comp(key1, key2))
+                continue;
+            else 
+            {
+                // key1 and key2 are same
+                same++;
+            }
+        }
+    }
+    for (int j=0; j< same; j++)
+    {
+        vals[pos + j] = val1;
+        keys[pos + j] = key1;
+    }
+}
+__kernel
+    void selectionSortFinal
+    (
+        __global K_T * keys,
+        __global V_T * vals,
+        const int count
+    )
+{
+    const int          i  = get_local_id(0); // index in workgroup
+    const int numOfGroups = get_num_groups(0); // index in workgroup
+    const int groupID     = get_group_id(0);
+    const int         wg  = get_local_size(0); // workgroup size = block size
+    int pos = 0, same = 0;
+    const int offset = get_group_id(0) * wg;
+    const int remainder = count - wg*(numOfGroups-1);
+
+    if((offset + i ) >= count)
+        return;
+    V_T val1 = vals[offset + i];
+
+    K_T key1 = keys[offset + i];
+    K_T key2;
+
+    for(int j=0; j<numOfGroups-1; j++ )
+    {
+        for(int k=0; k<wg; k++)
+        {
+            key2 = keys[j*wg + k]; 
+            if(my_comp(key1, key2))
+                break;
+            else
+            {
+                //Increment only if the value is not the same. 
+                if(my_comp(key2, key1))
+                    pos++;
+                else 
+                    same++;
+            }
+        }
+    }
+
+    for(int k=0; k<remainder; k++)
+    {
+        key2 = keys[(numOfGroups-1)*wg + k]; 
+        if(my_comp(key1, key2))
+            break;
+        else
+        {
+            //Don't increment if the value is the same. 
+            if(my_comp(key2, key1))
+                pos++;
+            else 
+                same++;
+        }
+    }  
+    for (int j=0; j< same; j++)
+    {
+        vals[pos + j] = val1;
+        keys[pos + j] = key1;
+    }
+}
--- a/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
+++ b/modules/ocl/src/opencl/kernel_stablesort_by_key.cl
@@ -0,0 +1,296 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef K_T
+#define K_T float
+#endif
+
+#ifndef V_T
+#define V_T float
+#endif
+
+#ifndef IS_GT
+#define IS_GT false
+#endif
+
+#if IS_GT
+#define my_comp(x,y) ((x) > (y))
+#else
+#define my_comp(x,y) ((x) < (y))
+#endif
+
+///////////// parallel merge sort ///////////////
+// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
+uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
+{
+    //  The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
+    uint firstIndex = left;
+    uint lastIndex = right;
+
+    //  This loops through [firstIndex, lastIndex)
+    //  Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
+    //  this while loop will be divergent within a wavefront
+    while( firstIndex < lastIndex )
+    {
+        K_T dataVal = data[ firstIndex ];
+
+        //  This branch will create divergent wavefronts
+        if( my_comp( dataVal, searchVal ) )
+        {
+            firstIndex = firstIndex+1;
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    return firstIndex;
+}
+
+//  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
+//  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
+//  passed as a functor parameter my_comp
+//  This function returns an index that is the first index whos value would be equal to the searched value
+uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
+{
+    //  The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
+    uint firstIndex = left;
+    uint lastIndex = right;
+
+    //  This loops through [firstIndex, lastIndex)
+    //  Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
+    //  this while loop will be divergent within a wavefront
+    while( firstIndex < lastIndex )
+    {
+        //  midIndex is the average of first and last, rounded down
+        uint midIndex = ( firstIndex + lastIndex ) / 2;
+        K_T midValue = data[ midIndex ];
+
+        //  This branch will create divergent wavefronts
+        if( my_comp( midValue, searchVal ) )
+        {
+            firstIndex = midIndex+1;
+            // printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex );
+        }
+        else
+        {
+            lastIndex = midIndex;
+            // printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex );
+        }
+    }
+
+    return firstIndex;
+}
+
+//  This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
+//  by a base pointer and left and right index for a particular candidate value.  The comparison operator is
+//  passed as a functor parameter my_comp
+//  This function returns an index that is the first index whos value would be greater than the searched value
+//  If the search value is not found in the sequence, upperbound returns the same result as lowerbound
+uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
+{
+    uint upperBound = lowerBoundBinary( data, left, right, searchVal );
+
+    // printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound );
+    //  If upperBound == right, then  searchVal was not found in the sequence.  Just return.
+    if( upperBound != right )
+    {
+        //  While the values are equal i.e. !(x < y) && !(y < x) increment the index
+        K_T upperValue = data[ upperBound ];
+        while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) )
+        {
+            upperBound++;
+            upperValue = data[ upperBound ];
+        }
+    }
+
+    return upperBound;
+}
+
+//  This kernel implements merging of blocks of sorted data.  The input to this kernel most likely is
+//  the output of blockInsertionSortTemplate.  It is expected that the source array contains multiple
+//  blocks, each block is independently sorted.  The goal is to write into the output buffer half as
+//  many blocks, of double the size.  The even and odd blocks are stably merged together to form
+//  a new sorted block of twice the size.  The algorithm is out-of-place.
+kernel void merge(
+    global K_T*   iKey_ptr,
+    global V_T*   iValue_ptr,
+    global K_T*   oKey_ptr,
+    global V_T*   oValue_ptr,
+    const uint    srcVecSize,
+    const uint    srcLogicalBlockSize,
+    local K_T*    key_lds,
+    local V_T*    val_lds
+)
+{
+    size_t globalID     = get_global_id( 0 );
+    size_t groupID      = get_group_id( 0 );
+    size_t localID      = get_local_id( 0 );
+    size_t wgSize       = get_local_size( 0 );
+
+    //  Abort threads that are passed the end of the input vector
+    if( globalID >= srcVecSize )
+        return; // on SI this doesn't mess-up barriers
+
+    //  For an element in sequence A, find the lowerbound index for it in sequence B
+    uint srcBlockNum   = globalID / srcLogicalBlockSize;
+    uint srcBlockIndex = globalID % srcLogicalBlockSize;
+
+    // printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex );
+
+    //  Pairs of even-odd blocks will be merged together
+    //  An even block should search for an insertion point in the next odd block,
+    //  and the odd block should look for an insertion point in the corresponding previous even block
+    uint dstLogicalBlockSize = srcLogicalBlockSize<<1;
+    uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 );
+    leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize;
+    leftBlockIndex = min( leftBlockIndex, srcVecSize );
+    uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize );
+
+    // if( localID == 0 )
+    // {
+    // printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex );
+    // }
+
+    //  For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex
+    // uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex;
+    uint insertionIndex = 0;
+    if( (srcBlockNum & 0x1) == 0 )
+    {
+        insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
+    }
+    else
+    {
+        insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
+    }
+
+    //  The index of an element in the result sequence is the summation of it's indixes in the two input
+    //  sequences
+    uint dstBlockIndex = srcBlockIndex + insertionIndex;
+    uint dstBlockNum = srcBlockNum/2;
+
+    // if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 )
+    // {
+    // printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex );
+    // printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] );
+    // }
+    oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ];
+    oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ];
+    // printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex );
+}
+
+kernel void blockInsertionSort(
+    global K_T*   key_ptr,
+    global V_T*   value_ptr,
+    const uint    vecSize,
+    local K_T*    key_lds,
+    local V_T*    val_lds
+)
+{
+    size_t gloId    = get_global_id( 0 );
+    size_t groId    = get_group_id( 0 );
+    size_t locId    = get_local_id( 0 );
+    size_t wgSize   = get_local_size( 0 );
+
+    bool in_range = gloId < vecSize;
+    K_T key;
+    V_T val;
+    //  Abort threads that are passed the end of the input vector
+    if (in_range)
+    {
+        //  Make a copy of the entire input array into fast local memory
+        key = key_ptr[ gloId ];
+        val = value_ptr[ gloId ];
+        key_lds[ locId ] = key;
+        val_lds[ locId ] = val;
+    }
+    barrier( CLK_LOCAL_MEM_FENCE );
+    //  Sorts a workgroup using a naive insertion sort
+    //  The sort uses one thread within a workgroup to sort the entire workgroup
+    if( locId == 0 && in_range )
+    {
+        //  The last workgroup may have an irregular size, so we calculate a per-block endIndex
+        //  endIndex is essentially emulating a mod operator with subtraction and multiply
+        size_t endIndex = vecSize - ( groId * wgSize );
+        endIndex = min( endIndex, wgSize );
+
+        // printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
+
+        //  Indices are signed because the while loop will generate a -1 index inside of the max function
+        for( int currIndex = 1; currIndex < endIndex; ++currIndex )
+        {
+            key = key_lds[ currIndex ];
+            val = val_lds[ currIndex ];
+            int scanIndex = currIndex;
+            K_T ldsKey = key_lds[scanIndex - 1];
+            while( scanIndex > 0 && my_comp( key, ldsKey ) )
+            {
+                V_T ldsVal = val_lds[scanIndex - 1];
+
+                //  If the keys are being swapped, make sure the values are swapped identicaly
+                key_lds[ scanIndex ] = ldsKey;
+                val_lds[ scanIndex ] = ldsVal;
+
+                scanIndex = scanIndex - 1;
+                ldsKey = key_lds[ max( 0, scanIndex - 1 ) ];  // scanIndex-1 may be -1
+            }
+            key_lds[ scanIndex ] = key;
+            val_lds[ scanIndex ] = val;
+        }
+    }
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    if(in_range)
+    {
+        key = key_lds[ locId ];
+        key_ptr[ gloId ] = key;
+
+        val = val_lds[ locId ];
+        value_ptr[ gloId ] = val;
+    }
+}
+
+///////////// Radix sort from b40c library /////////////
--- a/modules/ocl/src/sort_by_key.cpp
+++ b/modules/ocl/src/sort_by_key.cpp
@@ -0,0 +1,454 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other oclMaterials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <iomanip>
+#include "precomp.hpp"
+
+namespace cv
+{
+namespace ocl
+{
+
+extern const char * kernel_sort_by_key;
+extern const char * kernel_stablesort_by_key;
+extern const char * kernel_radix_sort_by_key;
+
+void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);
+
+//TODO(pengx17): change this value depending on device other than a constant
+const static unsigned int GROUP_SIZE = 256;
+
+const char * depth_strings[] =
+{
+    "uchar",   //CV_8U
+    "char",    //CV_8S
+    "ushort",  //CV_16U
+    "short",   //CV_16S
+    "int",     //CV_32S
+    "float",   //CV_32F
+    "double"   //CV_64F
+};
+
+void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
+{
+    sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
+            isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]);
+    if(vals.oclchannels() > 1)
+    {
+        sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels());
+    }
+}
+inline bool isSizePowerOf2(size_t size)
+{
+    return ((size - 1) & (size)) == 0;
+}
+
+namespace bitonic_sort
+{
+static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
+{
+    CV_Assert(isSizePowerOf2(vecSize));
+
+    Context * cxt = Context::getContext();
+    size_t globalThreads[3] = {vecSize / 2, 1, 1};
+    size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+    // 2^numStages should be equal to vecSize or the output is invalid
+    int numStages = 0;
+    for(int i = vecSize; i > 1; i >>= 1)
+    {
+        ++numStages;
+    }
+    char build_opt_buf [100];
+    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+    const int argc = 5;
+    std::vector< std::pair<size_t, const void *> > args(argc);
+    String kernelname = "bitonicSort";
+
+    args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
+    args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
+    args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize);
+
+    for(int stage = 0; stage < numStages; ++stage)
+    {
+        args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
+        for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
+        {
+            args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
+            openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
+        }
+    }
+}
+}  /* bitonic_sort */
+
+namespace selection_sort
+{
+// FIXME:
+// This function cannot sort arrays with duplicated keys
+static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
+{
+    CV_Error(-1, "This function is incorrect at the moment.");
+    Context * cxt = Context::getContext();
+
+    size_t globalThreads[3] = {vecSize, 1, 1};
+    size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+    std::vector< std::pair<size_t, const void *> > args;
+    char build_opt_buf [100];
+    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+
+    //local
+    String kernelname = "selectionSortLocal";
+    int lds_size = GROUP_SIZE * keys.elemSize();
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
+    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
+    args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
+    args.push_back(std::make_pair(lds_size,       (void*)NULL));
+
+    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
+
+    //final
+    kernelname = "selectionSortFinal";
+    args.pop_back();
+    openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
+}
+
+}  /* selection_sort */
+
+
+namespace radix_sort
+{
+//FIXME(pengx17): 
+// exclusive scan, need to be optimized as this is too naive...
+//void naive_scan_addition(oclMat& input, oclMat& output)
+//{
+//    Context * cxt = Context::getContext();
+//    size_t vecSize = input.cols;
+//    size_t globalThreads[3] = {1, 1, 1};
+//    size_t localThreads[3]  = {1, 1, 1};
+//
+//    String kernelname = "naiveScanAddition";
+//
+//    std::vector< std::pair<size_t, const void *> > args;
+//    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
+//    args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
+//    args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
+//    openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
+//}
+
+void static naive_scan_addition_cpu(oclMat& input, oclMat& output)
+{
+    Mat m_input = input, m_output(output.size(), output.type());
+    MatIterator_<int> i_mit = m_input.begin<int>();
+    MatIterator_<int> o_mit = m_output.begin<int>();
+    *o_mit = 0;
+    ++i_mit;
+    ++o_mit;
+    for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit)
+    {
+        *o_mit = *(o_mit - 1) + *(i_mit - 1);
+    }
+    output = m_output;
+}
+
+
+//radix sort ported from Bolt
+static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
+{
+    CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
+
+    bool isKeyFloat = keys.type() == CV_32F;
+
+    const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
+                         //local array of 16 elements in the histogram kernel.
+    const int RADICES = (1 << RADIX); //Values handeled by each work-item?
+
+    bool  newBuffer = false;
+    size_t vecSize = origVecSize;
+
+    unsigned int groupSize  = RADICES;
+
+    size_t mulFactor = groupSize * RADICES;
+
+    oclMat buffer_keys, buffer_vals;
+
+    if(origVecSize % mulFactor != 0)
+    {
+        vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
+        buffer_keys.create(1, vecSize, keys.type());
+        buffer_vals.create(1, vecSize, vals.type());
+        Scalar padding_value;
+        oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1));
+
+        if(isGreaterThan)
+        {
+            switch(buffer_keys.depth())
+            {
+            case CV_32F:
+                padding_value = Scalar::all(-FLT_MAX);
+                break;
+            case CV_32S:
+                padding_value = Scalar::all(INT_MIN);
+                break;
+            }
+        }
+        else
+        {
+            switch(buffer_keys.depth())
+            {
+            case CV_32F:
+                padding_value = Scalar::all(FLT_MAX);
+                break;
+            case CV_32S:
+                padding_value = Scalar::all(INT_MAX);
+                break;
+            }
+        }
+        ocl::copyMakeBorder(
+            keys(Rect(0,0,origVecSize,1)), buffer_keys, 
+            0, 0, 0, vecSize - origVecSize, 
+            BORDER_CONSTANT, padding_value);
+        vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals);
+        newBuffer = true;
+    }
+    else
+    {
+        buffer_keys = keys;
+        buffer_vals = vals;
+        newBuffer = false;
+    }
+    oclMat swap_input_keys(1, vecSize, keys.type());
+    oclMat swap_input_vals(1, vecSize, vals.type());
+    oclMat hist_bin_keys(1, vecSize, CV_32SC1);
+    oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);
+
+    Context * cxt = Context::getContext();
+
+    size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
+    size_t localThreads[3]  = {groupSize, 1, 1};
+
+    std::vector< std::pair<size_t, const void *> > args;
+    char build_opt_buf [100];
+    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+
+    //additional build option for radix sort
+    sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT"); 
+
+    String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};
+
+    int swap = 0;
+    for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX)
+    {
+        args.clear();
+        //Do a histogram pass locally
+        if(swap == 0)
+        {
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
+        }
+        else
+        {
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
+        }
+        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
+        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
+        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
+            args, -1, -1, build_opt_buf);
+
+        args.clear();
+        //Perform a global scan
+        naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
+        // end of scan
+        if(swap == 0)
+        {
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
+        }
+        else
+        {
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
+        }
+        args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
+        args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
+
+        if(swap == 0)
+        {
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
+        }
+        else
+        {
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
+            args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
+        }
+        openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
+            args, -1, -1, build_opt_buf);
+        swap = swap ? 0 : 1;
+    }
+    if(newBuffer)
+    {
+        buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys);
+        buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals);
+    }
+}
+
+}  /* radix_sort */
+
+namespace merge_sort
+{
+static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
+{
+    Context * cxt = Context::getContext();
+
+    size_t globalThreads[3] = {vecSize, 1, 1};
+    size_t localThreads[3]  = {GROUP_SIZE, 1, 1};
+
+    std::vector< std::pair<size_t, const void *> > args;
+    char build_opt_buf [100];
+    genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
+
+    String kernelname[] = {String("blockInsertionSort"), String("merge")};
+    int keylds_size = GROUP_SIZE * keys.elemSize();
+    int vallds_size = GROUP_SIZE * vals.elemSize();
+    args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&keys.data));
+    args.push_back(std::make_pair(sizeof(cl_mem),  (void *)&vals.data));
+    args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize));
+    args.push_back(std::make_pair(keylds_size,     (void*)NULL));
+    args.push_back(std::make_pair(vallds_size,     (void*)NULL));
+
+    openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);
+
+    //  Early exit for the case of no merge passes, values are already in destination vector
+    if(vecSize <= GROUP_SIZE)
+    {
+        return;
+    }
+
+    //  An odd number of elements requires an extra merge pass to sort
+    size_t numMerges = 0;
+    //  Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
+    //  this is how many merge passes we want
+    size_t log2BlockSize = vecSize >> 6;
+    for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
+    {
+        ++numMerges;
+    }
+    //  Check to see if the input vector size is a power of 2, if not we will need last merge pass
+    numMerges += isSizePowerOf2(vecSize)? 1: 0;
+
+    //  Allocate a flipflop buffer because the merge passes are out of place
+    oclMat tmpKeyBuffer(keys.size(), keys.type());
+    oclMat tmpValBuffer(vals.size(), vals.type());
+    args.resize(8);
+
+    args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize);
+    args[6] = std::make_pair(keylds_size,    (void*)NULL);
+    args[7] = std::make_pair(vallds_size,    (void*)NULL);
+
+    for(size_t pass = 1; pass <= numMerges; ++pass )
+    {
+        //  For each pass, flip the input-output buffers
+        if( pass & 0x1 )
+        {
+            args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
+            args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
+            args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
+            args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
+        }
+        else
+        {
+            args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
+            args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
+            args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
+            args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
+        }
+        //  For each pass, the merge window doubles
+        unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
+        args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
+        openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
+    }
+    //  If there are an odd number of merges, then the output data is sitting in the temp buffer.  We need to copy
+    //  the results back into the input array
+    if( numMerges & 1 )
+    {
+        tmpKeyBuffer.copyTo(keys);
+        tmpValBuffer.copyTo(vals);
+    }
+}
+}  /* merge_sort */
+
+}
+} /* namespace cv { namespace ocl */
+
+
+void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
+{
+    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
+    CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
+    CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );
+    switch(method)
+    {
+    case SORT_BITONIC:
+        bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
+        break;
+    case SORT_SELECTION:
+        selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
+        break;
+    case SORT_RADIX:
+        radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
+        break;
+    case SORT_MERGE:
+        merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
+        break;
+    }
+}
+
+void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
+{
+    CV_Assert( keys.size() == vals.size() );
+    CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
+    size_t vecSize = static_cast<size_t>(keys.cols);
+    sortByKey(keys, vals, vecSize, method, isGreaterThan);
+}