[*] Fixed two bugs in reduction functor: out of shared memory bounds access and missing volatile on GF100 and further
This commit is contained in:
parent
4378f398c7
commit
ebc3043c86
@ -451,7 +451,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <NcvBool tbCacheTextureIImg,
|
||||
NcvBool tbCacheTextureCascade,
|
||||
NcvBool tbDoAtomicCompaction>
|
||||
|
@ -64,40 +64,56 @@ static T divUp(T a, T b)
|
||||
template<typename T>
|
||||
struct functorAddValues
|
||||
{
|
||||
static __device__ __inline__ void reduce(T &in1out, T &in2)
|
||||
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
|
||||
{
|
||||
//Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
|
||||
*dst = *src;
|
||||
}
|
||||
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
|
||||
{
|
||||
in1out += in2;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename T>
|
||||
struct functorMinValues
|
||||
{
|
||||
static __device__ __inline__ void reduce(T &in1out, T &in2)
|
||||
{
|
||||
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
|
||||
{
|
||||
//Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
|
||||
*dst = *src;
|
||||
}
|
||||
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
|
||||
{
|
||||
in1out = in1out > in2 ? in2 : in1out;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename T>
|
||||
struct functorMaxValues
|
||||
{
|
||||
static __device__ __inline__ void reduce(T &in1out, T &in2)
|
||||
{
|
||||
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
|
||||
{
|
||||
//Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
|
||||
*dst = *src;
|
||||
}
|
||||
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
|
||||
{
|
||||
in1out = in1out > in2 ? in1out : in2;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename Tdata, class Tfunc, Ncv32u nThreads>
|
||||
static __device__ Tdata subReduce(Tdata threadElem)
|
||||
{
|
||||
Tfunc functor;
|
||||
|
||||
__shared__ Tdata reduceArr[nThreads];
|
||||
reduceArr[threadIdx.x] = threadElem;
|
||||
__shared__ Tdata _reduceArr[nThreads];
|
||||
volatile Tdata *reduceArr = _reduceArr;
|
||||
functor.assign(reduceArr + threadIdx.x, &threadElem);
|
||||
__syncthreads();
|
||||
|
||||
if (nThreads >= 256 && threadIdx.x < 128)
|
||||
@ -118,18 +134,20 @@ static __device__ Tdata subReduce(Tdata threadElem)
|
||||
{
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
|
||||
}
|
||||
if (nThreads >= 32)
|
||||
if (nThreads >= 32 && threadIdx.x < 16)
|
||||
{
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
|
||||
}
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
|
||||
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
return reduceArr[0];
|
||||
Tdata reduceRes;
|
||||
functor.assign(&reduceRes, reduceArr);
|
||||
return reduceRes;
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user