Merge pull request #1189 from pengx17:2.4_sort_by_key
This commit is contained in:
176
modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
Normal file
176
modules/ocl/src/opencl/kernel_radix_sort_by_key.cl
Normal file
@@ -0,0 +1,176 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
|
||||
|
||||
#ifndef N // number of radices
|
||||
#define N 4
|
||||
#endif
|
||||
|
||||
#ifndef K_T
|
||||
#define K_T float
|
||||
#endif
|
||||
|
||||
#ifndef V_T
|
||||
#define V_T float
|
||||
#endif
|
||||
|
||||
#ifndef IS_GT
|
||||
#define IS_GT 0
|
||||
#endif
|
||||
|
||||
|
||||
// from Thrust::b40c, link:
|
||||
// https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/detail/b40c/radixsort_key_conversion.h
|
||||
__inline uint convertKey(uint converted_key)
|
||||
{
|
||||
#ifdef K_FLT
|
||||
unsigned int mask = (converted_key & 0x80000000) ? 0xffffffff : 0x80000000;
|
||||
converted_key ^= mask;
|
||||
#elif defined(K_INT)
|
||||
const uint SIGN_MASK = 1u << ((sizeof(int) * 8) - 1);
|
||||
converted_key ^= SIGN_MASK;
|
||||
#else
|
||||
|
||||
#endif
|
||||
return converted_key;
|
||||
}
|
||||
|
||||
//FIXME(pengx17):
|
||||
// exclusive scan, need to be optimized as this is too naive...
|
||||
kernel
|
||||
void naiveScanAddition(
|
||||
__global int * input,
|
||||
__global int * output,
|
||||
int size
|
||||
)
|
||||
{
|
||||
if(get_global_id(0) == 0)
|
||||
{
|
||||
output[0] = 0;
|
||||
for(int i = 1; i < size; i ++)
|
||||
{
|
||||
output[i] = output[i - 1] + input[i - 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// following is ported from
|
||||
// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_uint_kernels.cl
|
||||
kernel
|
||||
void histogramRadixN (
|
||||
__global K_T* unsortedKeys,
|
||||
__global int * buckets,
|
||||
uint shiftCount
|
||||
)
|
||||
{
|
||||
const int RADIX_T = N;
|
||||
const int RADICES_T = (1 << RADIX_T);
|
||||
const int NUM_OF_ELEMENTS_PER_WORK_ITEM_T = RADICES_T;
|
||||
const int MASK_T = (1 << RADIX_T) - 1;
|
||||
int localBuckets[16] = {0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0};
|
||||
int globalId = get_global_id(0);
|
||||
int numOfGroups = get_num_groups(0);
|
||||
|
||||
/* Calculate thread-histograms */
|
||||
for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
|
||||
{
|
||||
uint value = convertKey(as_uint(unsortedKeys[mad24(globalId, NUM_OF_ELEMENTS_PER_WORK_ITEM_T, i)]));
|
||||
value = (value >> shiftCount) & MASK_T;
|
||||
#if IS_GT
|
||||
localBuckets[RADICES_T - value - 1]++;
|
||||
#else
|
||||
localBuckets[value]++;
|
||||
#endif
|
||||
}
|
||||
|
||||
for(int i = 0; i < NUM_OF_ELEMENTS_PER_WORK_ITEM_T; ++i)
|
||||
{
|
||||
buckets[mad24(i, RADICES_T * numOfGroups, globalId) ] = localBuckets[i];
|
||||
}
|
||||
}
|
||||
|
||||
kernel
|
||||
void permuteRadixN (
|
||||
__global K_T* unsortedKeys,
|
||||
__global V_T* unsortedVals,
|
||||
__global int* scanedBuckets,
|
||||
uint shiftCount,
|
||||
__global K_T* sortedKeys,
|
||||
__global V_T* sortedVals
|
||||
)
|
||||
{
|
||||
const int RADIX_T = N;
|
||||
const int RADICES_T = (1 << RADIX_T);
|
||||
const int MASK_T = (1<<RADIX_T) -1;
|
||||
|
||||
int globalId = get_global_id(0);
|
||||
int numOfGroups = get_num_groups(0);
|
||||
const int NUM_OF_ELEMENTS_PER_WORK_GROUP_T = numOfGroups << N;
|
||||
int localIndex[16];
|
||||
|
||||
/*Load the index to local memory*/
|
||||
for(int i = 0; i < RADICES_T; ++i)
|
||||
{
|
||||
#if IS_GT
|
||||
localIndex[i] = scanedBuckets[mad24(RADICES_T - i - 1, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
|
||||
#else
|
||||
localIndex[i] = scanedBuckets[mad24(i, NUM_OF_ELEMENTS_PER_WORK_GROUP_T, globalId)];
|
||||
#endif
|
||||
}
|
||||
/* Permute elements to appropriate location */
|
||||
for(int i = 0; i < RADICES_T; ++i)
|
||||
{
|
||||
int old_idx = mad24(globalId, RADICES_T, i);
|
||||
K_T ovalue = unsortedKeys[old_idx];
|
||||
uint value = convertKey(as_uint(ovalue));
|
||||
uint maskedValue = (value >> shiftCount) & MASK_T;
|
||||
uint index = localIndex[maskedValue];
|
||||
sortedKeys[index] = ovalue;
|
||||
sortedVals[index] = unsortedVals[old_idx];
|
||||
localIndex[maskedValue] = index + 1;
|
||||
}
|
||||
}
|
||||
245
modules/ocl/src/opencl/kernel_sort_by_key.cl
Normal file
245
modules/ocl/src/opencl/kernel_sort_by_key.cl
Normal file
@@ -0,0 +1,245 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef K_T
|
||||
#define K_T float
|
||||
#endif
|
||||
|
||||
#ifndef V_T
|
||||
#define V_T float
|
||||
#endif
|
||||
|
||||
#ifndef IS_GT
|
||||
#define IS_GT false
|
||||
#endif
|
||||
|
||||
#if IS_GT
|
||||
#define my_comp(x,y) ((x) > (y))
|
||||
#else
|
||||
#define my_comp(x,y) ((x) < (y))
|
||||
#endif
|
||||
|
||||
/////////////////////// Bitonic sort ////////////////////////////
|
||||
// ported from
|
||||
// https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_by_key_kernels.cl
|
||||
__kernel
|
||||
void bitonicSort
|
||||
(
|
||||
__global K_T * keys,
|
||||
__global V_T * vals,
|
||||
int count,
|
||||
int stage,
|
||||
int passOfStage
|
||||
)
|
||||
{
|
||||
const int threadId = get_global_id(0);
|
||||
if(threadId >= count / 2)
|
||||
{
|
||||
return;
|
||||
}
|
||||
const int pairDistance = 1 << (stage - passOfStage);
|
||||
const int blockWidth = 2 * pairDistance;
|
||||
|
||||
int leftId = min( (threadId % pairDistance)
|
||||
+ (threadId / pairDistance) * blockWidth, count );
|
||||
|
||||
int rightId = min( leftId + pairDistance, count );
|
||||
|
||||
int temp;
|
||||
|
||||
const V_T lval = vals[leftId];
|
||||
const V_T rval = vals[rightId];
|
||||
|
||||
const K_T lkey = keys[leftId];
|
||||
const K_T rkey = keys[rightId];
|
||||
|
||||
int sameDirectionBlockWidth = 1 << stage;
|
||||
|
||||
if((threadId/sameDirectionBlockWidth) % 2 == 1)
|
||||
{
|
||||
temp = rightId;
|
||||
rightId = leftId;
|
||||
leftId = temp;
|
||||
}
|
||||
|
||||
const bool compareResult = my_comp(lkey, rkey);
|
||||
|
||||
if(compareResult)
|
||||
{
|
||||
keys[rightId] = rkey;
|
||||
keys[leftId] = lkey;
|
||||
vals[rightId] = rval;
|
||||
vals[leftId] = lval;
|
||||
}
|
||||
else
|
||||
{
|
||||
keys[rightId] = lkey;
|
||||
keys[leftId] = rkey;
|
||||
vals[rightId] = lval;
|
||||
vals[leftId] = rval;
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////// Selection sort ////////////////////////////
|
||||
//kernel is ported from Bolt library:
|
||||
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
|
||||
__kernel
|
||||
void selectionSortLocal
|
||||
(
|
||||
__global K_T * keys,
|
||||
__global V_T * vals,
|
||||
const int count,
|
||||
__local K_T * scratch
|
||||
)
|
||||
{
|
||||
int i = get_local_id(0); // index in workgroup
|
||||
int numOfGroups = get_num_groups(0); // index in workgroup
|
||||
int groupID = get_group_id(0);
|
||||
int wg = get_local_size(0); // workgroup size = block size
|
||||
int n; // number of elements to be processed for this work group
|
||||
|
||||
int offset = groupID * wg;
|
||||
int same = 0;
|
||||
|
||||
vals += offset;
|
||||
keys += offset;
|
||||
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
|
||||
|
||||
int clamped_i= min(i, n - 1);
|
||||
|
||||
K_T key1 = keys[clamped_i], key2;
|
||||
V_T val1 = vals[clamped_i];
|
||||
scratch[i] = key1;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(i >= n)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
int pos = 0;
|
||||
for (int j=0;j<n;++j)
|
||||
{
|
||||
key2 = scratch[j];
|
||||
if(my_comp(key2, key1))
|
||||
pos++;//calculate the rank of this element in this work group
|
||||
else
|
||||
{
|
||||
if(my_comp(key1, key2))
|
||||
continue;
|
||||
else
|
||||
{
|
||||
// key1 and key2 are same
|
||||
same++;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j< same; j++)
|
||||
{
|
||||
vals[pos + j] = val1;
|
||||
keys[pos + j] = key1;
|
||||
}
|
||||
}
|
||||
__kernel
|
||||
void selectionSortFinal
|
||||
(
|
||||
__global K_T * keys,
|
||||
__global V_T * vals,
|
||||
const int count
|
||||
)
|
||||
{
|
||||
const int i = get_local_id(0); // index in workgroup
|
||||
const int numOfGroups = get_num_groups(0); // index in workgroup
|
||||
const int groupID = get_group_id(0);
|
||||
const int wg = get_local_size(0); // workgroup size = block size
|
||||
int pos = 0, same = 0;
|
||||
const int offset = get_group_id(0) * wg;
|
||||
const int remainder = count - wg*(numOfGroups-1);
|
||||
|
||||
if((offset + i ) >= count)
|
||||
return;
|
||||
V_T val1 = vals[offset + i];
|
||||
|
||||
K_T key1 = keys[offset + i];
|
||||
K_T key2;
|
||||
|
||||
for(int j=0; j<numOfGroups-1; j++ )
|
||||
{
|
||||
for(int k=0; k<wg; k++)
|
||||
{
|
||||
key2 = keys[j*wg + k];
|
||||
if(my_comp(key1, key2))
|
||||
break;
|
||||
else
|
||||
{
|
||||
//Increment only if the value is not the same.
|
||||
if(my_comp(key2, key1))
|
||||
pos++;
|
||||
else
|
||||
same++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(int k=0; k<remainder; k++)
|
||||
{
|
||||
key2 = keys[(numOfGroups-1)*wg + k];
|
||||
if(my_comp(key1, key2))
|
||||
break;
|
||||
else
|
||||
{
|
||||
//Don't increment if the value is the same.
|
||||
if(my_comp(key2, key1))
|
||||
pos++;
|
||||
else
|
||||
same++;
|
||||
}
|
||||
}
|
||||
for (int j=0; j< same; j++)
|
||||
{
|
||||
vals[pos + j] = val1;
|
||||
keys[pos + j] = key1;
|
||||
}
|
||||
}
|
||||
296
modules/ocl/src/opencl/kernel_stablesort_by_key.cl
Normal file
296
modules/ocl/src/opencl/kernel_stablesort_by_key.cl
Normal file
@@ -0,0 +1,296 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef K_T
|
||||
#define K_T float
|
||||
#endif
|
||||
|
||||
#ifndef V_T
|
||||
#define V_T float
|
||||
#endif
|
||||
|
||||
#ifndef IS_GT
|
||||
#define IS_GT false
|
||||
#endif
|
||||
|
||||
#if IS_GT
|
||||
#define my_comp(x,y) ((x) > (y))
|
||||
#else
|
||||
#define my_comp(x,y) ((x) < (y))
|
||||
#endif
|
||||
|
||||
///////////// parallel merge sort ///////////////
|
||||
// ported from https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/stablesort_by_key_kernels.cl
|
||||
uint lowerBoundLinear( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
{
|
||||
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
|
||||
uint firstIndex = left;
|
||||
uint lastIndex = right;
|
||||
|
||||
// This loops through [firstIndex, lastIndex)
|
||||
// Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
|
||||
// this while loop will be divergent within a wavefront
|
||||
while( firstIndex < lastIndex )
|
||||
{
|
||||
K_T dataVal = data[ firstIndex ];
|
||||
|
||||
// This branch will create divergent wavefronts
|
||||
if( my_comp( dataVal, searchVal ) )
|
||||
{
|
||||
firstIndex = firstIndex+1;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return firstIndex;
|
||||
}
|
||||
|
||||
// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
|
||||
// by a base pointer and left and right index for a particular candidate value. The comparison operator is
|
||||
// passed as a functor parameter my_comp
|
||||
// This function returns an index that is the first index whos value would be equal to the searched value
|
||||
uint lowerBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
{
|
||||
// The values firstIndex and lastIndex get modified within the loop, narrowing down the potential sequence
|
||||
uint firstIndex = left;
|
||||
uint lastIndex = right;
|
||||
|
||||
// This loops through [firstIndex, lastIndex)
|
||||
// Since firstIndex and lastIndex will be different for every thread depending on the nested branch,
|
||||
// this while loop will be divergent within a wavefront
|
||||
while( firstIndex < lastIndex )
|
||||
{
|
||||
// midIndex is the average of first and last, rounded down
|
||||
uint midIndex = ( firstIndex + lastIndex ) / 2;
|
||||
K_T midValue = data[ midIndex ];
|
||||
|
||||
// This branch will create divergent wavefronts
|
||||
if( my_comp( midValue, searchVal ) )
|
||||
{
|
||||
firstIndex = midIndex+1;
|
||||
// printf( "lowerBound: lastIndex[ %i ]=%i\n", get_local_id( 0 ), lastIndex );
|
||||
}
|
||||
else
|
||||
{
|
||||
lastIndex = midIndex;
|
||||
// printf( "lowerBound: firstIndex[ %i ]=%i\n", get_local_id( 0 ), firstIndex );
|
||||
}
|
||||
}
|
||||
|
||||
return firstIndex;
|
||||
}
|
||||
|
||||
// This implements a binary search routine to look for an 'insertion point' in a sequence, denoted
|
||||
// by a base pointer and left and right index for a particular candidate value. The comparison operator is
|
||||
// passed as a functor parameter my_comp
|
||||
// This function returns an index that is the first index whos value would be greater than the searched value
|
||||
// If the search value is not found in the sequence, upperbound returns the same result as lowerbound
|
||||
uint upperBoundBinary( global K_T* data, uint left, uint right, K_T searchVal)
|
||||
{
|
||||
uint upperBound = lowerBoundBinary( data, left, right, searchVal );
|
||||
|
||||
// printf( "upperBoundBinary: upperBound[ %i, %i ]= %i\n", left, right, upperBound );
|
||||
// If upperBound == right, then searchVal was not found in the sequence. Just return.
|
||||
if( upperBound != right )
|
||||
{
|
||||
// While the values are equal i.e. !(x < y) && !(y < x) increment the index
|
||||
K_T upperValue = data[ upperBound ];
|
||||
while( !my_comp( upperValue, searchVal ) && !my_comp( searchVal, upperValue) && (upperBound != right) )
|
||||
{
|
||||
upperBound++;
|
||||
upperValue = data[ upperBound ];
|
||||
}
|
||||
}
|
||||
|
||||
return upperBound;
|
||||
}
|
||||
|
||||
// This kernel implements merging of blocks of sorted data. The input to this kernel most likely is
|
||||
// the output of blockInsertionSortTemplate. It is expected that the source array contains multiple
|
||||
// blocks, each block is independently sorted. The goal is to write into the output buffer half as
|
||||
// many blocks, of double the size. The even and odd blocks are stably merged together to form
|
||||
// a new sorted block of twice the size. The algorithm is out-of-place.
|
||||
kernel void merge(
|
||||
global K_T* iKey_ptr,
|
||||
global V_T* iValue_ptr,
|
||||
global K_T* oKey_ptr,
|
||||
global V_T* oValue_ptr,
|
||||
const uint srcVecSize,
|
||||
const uint srcLogicalBlockSize,
|
||||
local K_T* key_lds,
|
||||
local V_T* val_lds
|
||||
)
|
||||
{
|
||||
size_t globalID = get_global_id( 0 );
|
||||
size_t groupID = get_group_id( 0 );
|
||||
size_t localID = get_local_id( 0 );
|
||||
size_t wgSize = get_local_size( 0 );
|
||||
|
||||
// Abort threads that are passed the end of the input vector
|
||||
if( globalID >= srcVecSize )
|
||||
return; // on SI this doesn't mess-up barriers
|
||||
|
||||
// For an element in sequence A, find the lowerbound index for it in sequence B
|
||||
uint srcBlockNum = globalID / srcLogicalBlockSize;
|
||||
uint srcBlockIndex = globalID % srcLogicalBlockSize;
|
||||
|
||||
// printf( "mergeTemplate: srcBlockNum[%i]=%i\n", srcBlockNum, srcBlockIndex );
|
||||
|
||||
// Pairs of even-odd blocks will be merged together
|
||||
// An even block should search for an insertion point in the next odd block,
|
||||
// and the odd block should look for an insertion point in the corresponding previous even block
|
||||
uint dstLogicalBlockSize = srcLogicalBlockSize<<1;
|
||||
uint leftBlockIndex = globalID & ~((dstLogicalBlockSize) - 1 );
|
||||
leftBlockIndex += (srcBlockNum & 0x1) ? 0 : srcLogicalBlockSize;
|
||||
leftBlockIndex = min( leftBlockIndex, srcVecSize );
|
||||
uint rightBlockIndex = min( leftBlockIndex + srcLogicalBlockSize, srcVecSize );
|
||||
|
||||
// if( localID == 0 )
|
||||
// {
|
||||
// printf( "mergeTemplate: wavefront[ %i ] logicalBlock[ %i ] logicalIndex[ %i ] leftBlockIndex[ %i ] <=> rightBlockIndex[ %i ]\n", groupID, srcBlockNum, srcBlockIndex, leftBlockIndex, rightBlockIndex );
|
||||
// }
|
||||
|
||||
// For a particular element in the input array, find the lowerbound index for it in the search sequence given by leftBlockIndex & rightBlockIndex
|
||||
// uint insertionIndex = lowerBoundLinear( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ], my_comp ) - leftBlockIndex;
|
||||
uint insertionIndex = 0;
|
||||
if( (srcBlockNum & 0x1) == 0 )
|
||||
{
|
||||
insertionIndex = lowerBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
|
||||
}
|
||||
else
|
||||
{
|
||||
insertionIndex = upperBoundBinary( iKey_ptr, leftBlockIndex, rightBlockIndex, iKey_ptr[ globalID ] ) - leftBlockIndex;
|
||||
}
|
||||
|
||||
// The index of an element in the result sequence is the summation of it's indixes in the two input
|
||||
// sequences
|
||||
uint dstBlockIndex = srcBlockIndex + insertionIndex;
|
||||
uint dstBlockNum = srcBlockNum/2;
|
||||
|
||||
// if( (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex == 395 )
|
||||
// {
|
||||
// printf( "mergeTemplate: (dstBlockNum[ %i ] * dstLogicalBlockSize[ %i ]) + dstBlockIndex[ %i ] = srcBlockIndex[ %i ] + insertionIndex[ %i ]\n", dstBlockNum, dstLogicalBlockSize, dstBlockIndex, srcBlockIndex, insertionIndex );
|
||||
// printf( "mergeTemplate: dstBlockIndex[ %i ] = iKey_ptr[ %i ] ( %i )\n", (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex, globalID, iKey_ptr[ globalID ] );
|
||||
// }
|
||||
oKey_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iKey_ptr[ globalID ];
|
||||
oValue_ptr[ (dstBlockNum*dstLogicalBlockSize)+dstBlockIndex ] = iValue_ptr[ globalID ];
|
||||
// printf( "mergeTemplate: leftResultIndex[ %i ]=%i + %i\n", leftResultIndex, srcBlockIndex, leftInsertionIndex );
|
||||
}
|
||||
|
||||
kernel void blockInsertionSort(
|
||||
global K_T* key_ptr,
|
||||
global V_T* value_ptr,
|
||||
const uint vecSize,
|
||||
local K_T* key_lds,
|
||||
local V_T* val_lds
|
||||
)
|
||||
{
|
||||
size_t gloId = get_global_id( 0 );
|
||||
size_t groId = get_group_id( 0 );
|
||||
size_t locId = get_local_id( 0 );
|
||||
size_t wgSize = get_local_size( 0 );
|
||||
|
||||
bool in_range = gloId < vecSize;
|
||||
K_T key;
|
||||
V_T val;
|
||||
// Abort threads that are passed the end of the input vector
|
||||
if (in_range)
|
||||
{
|
||||
// Make a copy of the entire input array into fast local memory
|
||||
key = key_ptr[ gloId ];
|
||||
val = value_ptr[ gloId ];
|
||||
key_lds[ locId ] = key;
|
||||
val_lds[ locId ] = val;
|
||||
}
|
||||
barrier( CLK_LOCAL_MEM_FENCE );
|
||||
// Sorts a workgroup using a naive insertion sort
|
||||
// The sort uses one thread within a workgroup to sort the entire workgroup
|
||||
if( locId == 0 && in_range )
|
||||
{
|
||||
// The last workgroup may have an irregular size, so we calculate a per-block endIndex
|
||||
// endIndex is essentially emulating a mod operator with subtraction and multiply
|
||||
size_t endIndex = vecSize - ( groId * wgSize );
|
||||
endIndex = min( endIndex, wgSize );
|
||||
|
||||
// printf( "Debug: endIndex[%i]=%i\n", groId, endIndex );
|
||||
|
||||
// Indices are signed because the while loop will generate a -1 index inside of the max function
|
||||
for( int currIndex = 1; currIndex < endIndex; ++currIndex )
|
||||
{
|
||||
key = key_lds[ currIndex ];
|
||||
val = val_lds[ currIndex ];
|
||||
int scanIndex = currIndex;
|
||||
K_T ldsKey = key_lds[scanIndex - 1];
|
||||
while( scanIndex > 0 && my_comp( key, ldsKey ) )
|
||||
{
|
||||
V_T ldsVal = val_lds[scanIndex - 1];
|
||||
|
||||
// If the keys are being swapped, make sure the values are swapped identicaly
|
||||
key_lds[ scanIndex ] = ldsKey;
|
||||
val_lds[ scanIndex ] = ldsVal;
|
||||
|
||||
scanIndex = scanIndex - 1;
|
||||
ldsKey = key_lds[ max( 0, scanIndex - 1 ) ]; // scanIndex-1 may be -1
|
||||
}
|
||||
key_lds[ scanIndex ] = key;
|
||||
val_lds[ scanIndex ] = val;
|
||||
}
|
||||
}
|
||||
barrier( CLK_LOCAL_MEM_FENCE );
|
||||
|
||||
if(in_range)
|
||||
{
|
||||
key = key_lds[ locId ];
|
||||
key_ptr[ gloId ] = key;
|
||||
|
||||
val = val_lds[ locId ];
|
||||
value_ptr[ gloId ] = val;
|
||||
}
|
||||
}
|
||||
|
||||
///////////// Radix sort from b40c library /////////////
|
||||
454
modules/ocl/src/sort_by_key.cpp
Normal file
454
modules/ocl/src/sort_by_key.cpp
Normal file
@@ -0,0 +1,454 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Peng Xiao, pengxiao@outlook.com
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors as is and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include <iomanip>
|
||||
#include "precomp.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace ocl
|
||||
{
|
||||
|
||||
extern const char * kernel_sort_by_key;
|
||||
extern const char * kernel_stablesort_by_key;
|
||||
extern const char * kernel_radix_sort_by_key;
|
||||
|
||||
void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan);
|
||||
|
||||
//TODO(pengx17): change this value depending on device other than a constant
|
||||
const static unsigned int GROUP_SIZE = 256;
|
||||
|
||||
const char * depth_strings[] =
|
||||
{
|
||||
"uchar", //CV_8U
|
||||
"char", //CV_8S
|
||||
"ushort", //CV_16U
|
||||
"short", //CV_16S
|
||||
"int", //CV_32S
|
||||
"float", //CV_32F
|
||||
"double" //CV_64F
|
||||
};
|
||||
|
||||
void static genSortBuildOption(const oclMat& keys, const oclMat& vals, bool isGreaterThan, char * build_opt_buf)
|
||||
{
|
||||
sprintf(build_opt_buf, "-D IS_GT=%d -D K_T=%s -D V_T=%s",
|
||||
isGreaterThan?1:0, depth_strings[keys.depth()], depth_strings[vals.depth()]);
|
||||
if(vals.oclchannels() > 1)
|
||||
{
|
||||
sprintf( build_opt_buf + strlen(build_opt_buf), "%d", vals.oclchannels());
|
||||
}
|
||||
}
|
||||
inline bool isSizePowerOf2(size_t size)
|
||||
{
|
||||
return ((size - 1) & (size)) == 0;
|
||||
}
|
||||
|
||||
namespace bitonic_sort
|
||||
{
|
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
|
||||
{
|
||||
CV_Assert(isSizePowerOf2(vecSize));
|
||||
|
||||
Context * cxt = Context::getContext();
|
||||
size_t globalThreads[3] = {vecSize / 2, 1, 1};
|
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
|
||||
|
||||
// 2^numStages should be equal to vecSize or the output is invalid
|
||||
int numStages = 0;
|
||||
for(int i = vecSize; i > 1; i >>= 1)
|
||||
{
|
||||
++numStages;
|
||||
}
|
||||
char build_opt_buf [100];
|
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
|
||||
const int argc = 5;
|
||||
std::vector< std::pair<size_t, const void *> > args(argc);
|
||||
String kernelname = "bitonicSort";
|
||||
|
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
|
||||
args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
|
||||
args[2] = std::make_pair(sizeof(cl_int), (void *)&vecSize);
|
||||
|
||||
for(int stage = 0; stage < numStages; ++stage)
|
||||
{
|
||||
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
|
||||
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
|
||||
{
|
||||
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
|
||||
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
|
||||
}
|
||||
}
|
||||
}
|
||||
} /* bitonic_sort */
|
||||
|
||||
namespace selection_sort
|
||||
{
|
||||
// FIXME:
|
||||
// This function cannot sort arrays with duplicated keys
|
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
|
||||
{
|
||||
CV_Error(-1, "This function is incorrect at the moment.");
|
||||
Context * cxt = Context::getContext();
|
||||
|
||||
size_t globalThreads[3] = {vecSize, 1, 1};
|
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
char build_opt_buf [100];
|
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
|
||||
|
||||
//local
|
||||
String kernelname = "selectionSortLocal";
|
||||
int lds_size = GROUP_SIZE * keys.elemSize();
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
|
||||
args.push_back(std::make_pair(lds_size, (void*)NULL));
|
||||
|
||||
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
|
||||
|
||||
//final
|
||||
kernelname = "selectionSortFinal";
|
||||
args.pop_back();
|
||||
openCLExecuteKernel(cxt, &kernel_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1, build_opt_buf);
|
||||
}
|
||||
|
||||
} /* selection_sort */
|
||||
|
||||
|
||||
namespace radix_sort
|
||||
{
|
||||
//FIXME(pengx17):
|
||||
// exclusive scan, need to be optimized as this is too naive...
|
||||
//void naive_scan_addition(oclMat& input, oclMat& output)
|
||||
//{
|
||||
// Context * cxt = Context::getContext();
|
||||
// size_t vecSize = input.cols;
|
||||
// size_t globalThreads[3] = {1, 1, 1};
|
||||
// size_t localThreads[3] = {1, 1, 1};
|
||||
//
|
||||
// String kernelname = "naiveScanAddition";
|
||||
//
|
||||
// std::vector< std::pair<size_t, const void *> > args;
|
||||
// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&input.data));
|
||||
// args.push_back(std::make_pair(sizeof(cl_mem), (void *)&output.data));
|
||||
// args.push_back(std::make_pair(sizeof(cl_int), (void *)&vecSize));
|
||||
// openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelname, globalThreads, localThreads, args, -1, -1);
|
||||
//}
|
||||
|
||||
void static naive_scan_addition_cpu(oclMat& input, oclMat& output)
|
||||
{
|
||||
Mat m_input = input, m_output(output.size(), output.type());
|
||||
MatIterator_<int> i_mit = m_input.begin<int>();
|
||||
MatIterator_<int> o_mit = m_output.begin<int>();
|
||||
*o_mit = 0;
|
||||
++i_mit;
|
||||
++o_mit;
|
||||
for(; i_mit != m_input.end<int>(); ++i_mit, ++o_mit)
|
||||
{
|
||||
*o_mit = *(o_mit - 1) + *(i_mit - 1);
|
||||
}
|
||||
output = m_output;
|
||||
}
|
||||
|
||||
|
||||
//radix sort ported from Bolt
|
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t origVecSize, bool isGreaterThan)
|
||||
{
|
||||
CV_Assert(keys.depth() == CV_32S || keys.depth() == CV_32F); // we assume keys are 4 bytes
|
||||
|
||||
bool isKeyFloat = keys.type() == CV_32F;
|
||||
|
||||
const int RADIX = 4; //Now you cannot replace this with Radix 8 since there is a
|
||||
//local array of 16 elements in the histogram kernel.
|
||||
const int RADICES = (1 << RADIX); //Values handeled by each work-item?
|
||||
|
||||
bool newBuffer = false;
|
||||
size_t vecSize = origVecSize;
|
||||
|
||||
unsigned int groupSize = RADICES;
|
||||
|
||||
size_t mulFactor = groupSize * RADICES;
|
||||
|
||||
oclMat buffer_keys, buffer_vals;
|
||||
|
||||
if(origVecSize % mulFactor != 0)
|
||||
{
|
||||
vecSize = ((vecSize + mulFactor) / mulFactor) * mulFactor;
|
||||
buffer_keys.create(1, vecSize, keys.type());
|
||||
buffer_vals.create(1, vecSize, vals.type());
|
||||
Scalar padding_value;
|
||||
oclMat roi_buffer_vals = buffer_vals(Rect(0,0,origVecSize,1));
|
||||
|
||||
if(isGreaterThan)
|
||||
{
|
||||
switch(buffer_keys.depth())
|
||||
{
|
||||
case CV_32F:
|
||||
padding_value = Scalar::all(-FLT_MAX);
|
||||
break;
|
||||
case CV_32S:
|
||||
padding_value = Scalar::all(INT_MIN);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
switch(buffer_keys.depth())
|
||||
{
|
||||
case CV_32F:
|
||||
padding_value = Scalar::all(FLT_MAX);
|
||||
break;
|
||||
case CV_32S:
|
||||
padding_value = Scalar::all(INT_MAX);
|
||||
break;
|
||||
}
|
||||
}
|
||||
ocl::copyMakeBorder(
|
||||
keys(Rect(0,0,origVecSize,1)), buffer_keys,
|
||||
0, 0, 0, vecSize - origVecSize,
|
||||
BORDER_CONSTANT, padding_value);
|
||||
vals(Rect(0,0,origVecSize,1)).copyTo(roi_buffer_vals);
|
||||
newBuffer = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
buffer_keys = keys;
|
||||
buffer_vals = vals;
|
||||
newBuffer = false;
|
||||
}
|
||||
oclMat swap_input_keys(1, vecSize, keys.type());
|
||||
oclMat swap_input_vals(1, vecSize, vals.type());
|
||||
oclMat hist_bin_keys(1, vecSize, CV_32SC1);
|
||||
oclMat hist_bin_dest_keys(1, vecSize, CV_32SC1);
|
||||
|
||||
Context * cxt = Context::getContext();
|
||||
|
||||
size_t globalThreads[3] = {vecSize / RADICES, 1, 1};
|
||||
size_t localThreads[3] = {groupSize, 1, 1};
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
char build_opt_buf [100];
|
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
|
||||
|
||||
//additional build option for radix sort
|
||||
sprintf(build_opt_buf + strlen(build_opt_buf), " -D K_%s", isKeyFloat?"FLT":"INT");
|
||||
|
||||
String kernelnames[2] = {String("histogramRadixN"), String("permuteRadixN")};
|
||||
|
||||
int swap = 0;
|
||||
for(int bits = 0; bits < (static_cast<int>(keys.elemSize()) * 8); bits += RADIX)
|
||||
{
|
||||
args.clear();
|
||||
//Do a histogram pass locally
|
||||
if(swap == 0)
|
||||
{
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
|
||||
}
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
|
||||
openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[0], globalThreads, localThreads,
|
||||
args, -1, -1, build_opt_buf);
|
||||
|
||||
args.clear();
|
||||
//Perform a global scan
|
||||
naive_scan_addition_cpu(hist_bin_keys, hist_bin_dest_keys);
|
||||
// end of scan
|
||||
if(swap == 0)
|
||||
{
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
|
||||
}
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&hist_bin_dest_keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_int), (void *)&bits));
|
||||
|
||||
if(swap == 0)
|
||||
{
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&swap_input_vals.data));
|
||||
}
|
||||
else
|
||||
{
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&buffer_vals.data));
|
||||
}
|
||||
openCLExecuteKernel(cxt, &kernel_radix_sort_by_key, kernelnames[1], globalThreads, localThreads,
|
||||
args, -1, -1, build_opt_buf);
|
||||
swap = swap ? 0 : 1;
|
||||
}
|
||||
if(newBuffer)
|
||||
{
|
||||
buffer_keys(Rect(0,0,origVecSize,1)).copyTo(keys);
|
||||
buffer_vals(Rect(0,0,origVecSize,1)).copyTo(vals);
|
||||
}
|
||||
}
|
||||
|
||||
} /* radix_sort */
|
||||
|
||||
namespace merge_sort
|
||||
{
|
||||
static void sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, bool isGreaterThan)
|
||||
{
|
||||
Context * cxt = Context::getContext();
|
||||
|
||||
size_t globalThreads[3] = {vecSize, 1, 1};
|
||||
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
|
||||
|
||||
std::vector< std::pair<size_t, const void *> > args;
|
||||
char build_opt_buf [100];
|
||||
genSortBuildOption(keys, vals, isGreaterThan, build_opt_buf);
|
||||
|
||||
String kernelname[] = {String("blockInsertionSort"), String("merge")};
|
||||
int keylds_size = GROUP_SIZE * keys.elemSize();
|
||||
int vallds_size = GROUP_SIZE * vals.elemSize();
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&keys.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_mem), (void *)&vals.data));
|
||||
args.push_back(std::make_pair(sizeof(cl_uint), (void *)&vecSize));
|
||||
args.push_back(std::make_pair(keylds_size, (void*)NULL));
|
||||
args.push_back(std::make_pair(vallds_size, (void*)NULL));
|
||||
|
||||
openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[0], globalThreads, localThreads, args, -1, -1, build_opt_buf);
|
||||
|
||||
// Early exit for the case of no merge passes, values are already in destination vector
|
||||
if(vecSize <= GROUP_SIZE)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// An odd number of elements requires an extra merge pass to sort
|
||||
size_t numMerges = 0;
|
||||
// Calculate the log2 of vecSize, taking into acvecSize our block size from kernel 1 is 64
|
||||
// this is how many merge passes we want
|
||||
size_t log2BlockSize = vecSize >> 6;
|
||||
for( ; log2BlockSize > 1; log2BlockSize >>= 1 )
|
||||
{
|
||||
++numMerges;
|
||||
}
|
||||
// Check to see if the input vector size is a power of 2, if not we will need last merge pass
|
||||
numMerges += isSizePowerOf2(vecSize)? 1: 0;
|
||||
|
||||
// Allocate a flipflop buffer because the merge passes are out of place
|
||||
oclMat tmpKeyBuffer(keys.size(), keys.type());
|
||||
oclMat tmpValBuffer(vals.size(), vals.type());
|
||||
args.resize(8);
|
||||
|
||||
args[4] = std::make_pair(sizeof(cl_uint), (void *)&vecSize);
|
||||
args[6] = std::make_pair(keylds_size, (void*)NULL);
|
||||
args[7] = std::make_pair(vallds_size, (void*)NULL);
|
||||
|
||||
for(size_t pass = 1; pass <= numMerges; ++pass )
|
||||
{
|
||||
// For each pass, flip the input-output buffers
|
||||
if( pass & 0x1 )
|
||||
{
|
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
|
||||
args[1] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
|
||||
args[2] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
|
||||
args[3] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
|
||||
}
|
||||
else
|
||||
{
|
||||
args[0] = std::make_pair(sizeof(cl_mem), (void *)&tmpKeyBuffer.data);
|
||||
args[1] = std::make_pair(sizeof(cl_mem), (void *)&tmpValBuffer.data);
|
||||
args[2] = std::make_pair(sizeof(cl_mem), (void *)&keys.data);
|
||||
args[3] = std::make_pair(sizeof(cl_mem), (void *)&vals.data);
|
||||
}
|
||||
// For each pass, the merge window doubles
|
||||
unsigned int srcLogicalBlockSize = static_cast<unsigned int>( localThreads[0] << (pass-1) );
|
||||
args[5] = std::make_pair(sizeof(cl_uint), (void *)&srcLogicalBlockSize);
|
||||
openCLExecuteKernel(cxt, &kernel_stablesort_by_key, kernelname[1], globalThreads, localThreads, args, -1, -1, build_opt_buf);
|
||||
}
|
||||
// If there are an odd number of merges, then the output data is sitting in the temp buffer. We need to copy
|
||||
// the results back into the input array
|
||||
if( numMerges & 1 )
|
||||
{
|
||||
tmpKeyBuffer.copyTo(keys);
|
||||
tmpValBuffer.copyTo(vals);
|
||||
}
|
||||
}
|
||||
} /* merge_sort */
|
||||
|
||||
}
|
||||
} /* namespace cv { namespace ocl */
|
||||
|
||||
|
||||
void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, size_t vecSize, int method, bool isGreaterThan)
|
||||
{
|
||||
CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
|
||||
CV_Assert( keys.channels() == 1 ); // we only allow one channel keys
|
||||
CV_Assert( vecSize <= static_cast<size_t>(keys.cols) );
|
||||
switch(method)
|
||||
{
|
||||
case SORT_BITONIC:
|
||||
bitonic_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
|
||||
break;
|
||||
case SORT_SELECTION:
|
||||
selection_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
|
||||
break;
|
||||
case SORT_RADIX:
|
||||
radix_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
|
||||
break;
|
||||
case SORT_MERGE:
|
||||
merge_sort::sortByKey(keys, vals, vecSize, isGreaterThan);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void cv::ocl::sortByKey(oclMat& keys, oclMat& vals, int method, bool isGreaterThan)
|
||||
{
|
||||
CV_Assert( keys.size() == vals.size() );
|
||||
CV_Assert( keys.rows == 1 ); // we only allow one dimensional input
|
||||
size_t vecSize = static_cast<size_t>(keys.cols);
|
||||
sortByKey(keys, vals, vecSize, method, isGreaterThan);
|
||||
}
|
||||
Reference in New Issue
Block a user