[*] Fixed two bugs in the reduction functor: an out-of-bounds shared memory access and a missing volatile qualifier (needed on GF100 and later)
This commit is contained in:
parent
4378f398c7
commit
ebc3043c86
@ -64,7 +64,12 @@ static T divUp(T a, T b)
|
|||||||
template<typename T>
|
template<typename T>
|
||||||
struct functorAddValues
|
struct functorAddValues
|
||||||
{
|
{
|
||||||
static __device__ __inline__ void reduce(T &in1out, T &in2)
|
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
|
||||||
|
{
|
||||||
|
//Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
|
||||||
|
*dst = *src;
|
||||||
|
}
|
||||||
|
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
|
||||||
{
|
{
|
||||||
in1out += in2;
|
in1out += in2;
|
||||||
}
|
}
|
||||||
@ -74,7 +79,12 @@ struct functorAddValues
|
|||||||
template<typename T>
|
template<typename T>
|
||||||
struct functorMinValues
|
struct functorMinValues
|
||||||
{
|
{
|
||||||
static __device__ __inline__ void reduce(T &in1out, T &in2)
|
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
|
||||||
|
{
|
||||||
|
//Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
|
||||||
|
*dst = *src;
|
||||||
|
}
|
||||||
|
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
|
||||||
{
|
{
|
||||||
in1out = in1out > in2 ? in2 : in1out;
|
in1out = in1out > in2 ? in2 : in1out;
|
||||||
}
|
}
|
||||||
@ -84,7 +94,12 @@ struct functorMinValues
|
|||||||
template<typename T>
|
template<typename T>
|
||||||
struct functorMaxValues
|
struct functorMaxValues
|
||||||
{
|
{
|
||||||
static __device__ __inline__ void reduce(T &in1out, T &in2)
|
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
|
||||||
|
{
|
||||||
|
//Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
|
||||||
|
*dst = *src;
|
||||||
|
}
|
||||||
|
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
|
||||||
{
|
{
|
||||||
in1out = in1out > in2 ? in1out : in2;
|
in1out = in1out > in2 ? in1out : in2;
|
||||||
}
|
}
|
||||||
@ -96,8 +111,9 @@ static __device__ Tdata subReduce(Tdata threadElem)
|
|||||||
{
|
{
|
||||||
Tfunc functor;
|
Tfunc functor;
|
||||||
|
|
||||||
__shared__ Tdata reduceArr[nThreads];
|
__shared__ Tdata _reduceArr[nThreads];
|
||||||
reduceArr[threadIdx.x] = threadElem;
|
volatile Tdata *reduceArr = _reduceArr;
|
||||||
|
functor.assign(reduceArr + threadIdx.x, &threadElem);
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
if (nThreads >= 256 && threadIdx.x < 128)
|
if (nThreads >= 256 && threadIdx.x < 128)
|
||||||
@ -118,18 +134,20 @@ static __device__ Tdata subReduce(Tdata threadElem)
|
|||||||
{
|
{
|
||||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
|
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
|
||||||
}
|
}
|
||||||
if (nThreads >= 32)
|
if (nThreads >= 32 && threadIdx.x < 16)
|
||||||
{
|
{
|
||||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
|
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
|
||||||
}
|
|
||||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
|
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
|
||||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
|
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
|
||||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
|
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
|
||||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
|
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
return reduceArr[0];
|
Tdata reduceRes;
|
||||||
|
functor.assign(&reduceRes, reduceArr);
|
||||||
|
return reduceRes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user