[*] Fixed two bugs in the reduction functor: an out-of-bounds shared-memory access, and a missing volatile qualifier on GF100 (Fermi) and later architectures
This commit is contained in:
parent
4378f398c7
commit
ebc3043c86
@ -451,7 +451,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <NcvBool tbCacheTextureIImg,
|
template <NcvBool tbCacheTextureIImg,
|
||||||
NcvBool tbCacheTextureCascade,
|
NcvBool tbCacheTextureCascade,
|
||||||
NcvBool tbDoAtomicCompaction>
|
NcvBool tbDoAtomicCompaction>
|
||||||
|
@ -64,40 +64,56 @@ static T divUp(T a, T b)
|
|||||||
//==============================================================================
// functorAddValues: sum-reduction functor for subReduce.
//
// All shared-memory traffic goes through volatile-qualified accesses so the
// compiler cannot cache values in registers during the warp-synchronous tail
// of the reduction (required on GF100/Fermi and later per the commit note).
//==============================================================================
template<typename T>
struct functorAddValues
{
    // Copy *src into *dst through volatile accesses.
    // Works only for integral types. If you see a compiler error here, then
    // you have to specify how to copy your object as a set of integral fields.
    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
    {
        *dst = *src;
    }

    // Accumulate: in1out += in2.
    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
    {
        in1out += in2;
    }
};
|
|
||||||
|
|
||||||
//==============================================================================
// functorMinValues: minimum-reduction functor for subReduce.
//
// Volatile-qualified accesses keep shared-memory reads/writes live during the
// warp-synchronous tail of the reduction (GF100/Fermi and later).
//==============================================================================
template<typename T>
struct functorMinValues
{
    // Copy *src into *dst through volatile accesses.
    // Works only for integral types. If you see a compiler error here, then
    // you have to specify how to copy your object as a set of integral fields.
    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
    {
        *dst = *src;
    }

    // Keep the smaller of the two operands in in1out.
    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
    {
        in1out = in1out > in2 ? in2 : in1out;
    }
};
|
|
||||||
|
|
||||||
//==============================================================================
// functorMaxValues: maximum-reduction functor for subReduce.
//
// Volatile-qualified accesses keep shared-memory reads/writes live during the
// warp-synchronous tail of the reduction (GF100/Fermi and later).
//==============================================================================
template<typename T>
struct functorMaxValues
{
    // Copy *src into *dst through volatile accesses.
    // Works only for integral types. If you see a compiler error here, then
    // you have to specify how to copy your object as a set of integral fields.
    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
    {
        *dst = *src;
    }

    // Keep the larger of the two operands in in1out.
    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
    {
        in1out = in1out > in2 ? in1out : in2;
    }
};
|
|
||||||
|
|
||||||
//==============================================================================
// subReduce: block-wide tree reduction over nThreads elements in shared memory.
//
// Each thread contributes threadElem; the value of reducing all contributions
// with Tfunc (functorAddValues / functorMinValues / functorMaxValues) is
// returned to every thread. nThreads must equal blockDim.x and be a power of
// two (implied by the halving schedule below).
//
// Two correctness points from the commit this code belongs to:
//  * shared memory is accessed through a volatile pointer, so the
//    warp-synchronous tail (last 32 threads, no __syncthreads) is not broken
//    by register caching on GF100 and later;
//  * the final sub-warp steps are guarded with threadIdx.x < 16, so threads
//    16..31 do not read reduceArr[threadIdx.x + 16..1] past their lane range
//    (the former out-of-bounds access when nThreads == 32).
//
// NOTE(review): the diff this was recovered from elides the unchanged
// 256->128 and 128->64 reduction steps; they are reconstructed here in the
// standard pattern — verify against the full file.
//==============================================================================
template<typename Tdata, class Tfunc, Ncv32u nThreads>
static __device__ Tdata subReduce(Tdata threadElem)
{
    Tfunc functor;

    __shared__ Tdata _reduceArr[nThreads];
    // Volatile view of the scratch array: forces real shared-memory
    // loads/stores inside the warp-synchronous tail.
    volatile Tdata *reduceArr = _reduceArr;
    functor.assign(reduceArr + threadIdx.x, &threadElem);
    __syncthreads();

    // Tree reduction; steps above warp size need a full block barrier.
    if (nThreads >= 256 && threadIdx.x < 128)
    {
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
    }
    __syncthreads();

    if (nThreads >= 128 && threadIdx.x < 64)
    {
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
    }
    __syncthreads();

    if (threadIdx.x < 32)
    {
        if (nThreads >= 64)
        {
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
        }
        // Guarding with threadIdx.x < 16 keeps lanes 16..31 from reading
        // beyond their half of the warp (the out-of-bounds bug fix).
        if (nThreads >= 32 && threadIdx.x < 16)
        {
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
        }
    }

    __syncthreads();
    // Read the result back through the volatile view as well.
    Tdata reduceRes;
    functor.assign(&reduceRes, reduceArr);
    return reduceRes;
}
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user