[*] Approach to the bug with integral image calculation on SM_2.0 (Fermi)
This commit is contained in:
@@ -71,6 +71,9 @@
|
|||||||
//==============================================================================
|
//==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
|
||||||
|
|
||||||
|
|
||||||
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
|
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
|
||||||
//assuming size <= WARP_SIZE and size is power of 2
|
//assuming size <= WARP_SIZE and size is power of 2
|
||||||
template <class T>
|
template <class T>
|
||||||
@@ -81,10 +84,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
|
|||||||
pos += K_WARP_SIZE;
|
pos += K_WARP_SIZE;
|
||||||
s_Data[pos] = idata;
|
s_Data[pos] = idata;
|
||||||
|
|
||||||
for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
|
//for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
|
||||||
{
|
//{
|
||||||
s_Data[pos] += s_Data[pos - offset];
|
// s_Data[pos] += s_Data[pos - offset];
|
||||||
}
|
//}
|
||||||
|
|
||||||
|
s_Data[pos] += s_Data[pos - 1];
|
||||||
|
s_Data[pos] += s_Data[pos - 2];
|
||||||
|
s_Data[pos] += s_Data[pos - 4];
|
||||||
|
s_Data[pos] += s_Data[pos - 8];
|
||||||
|
s_Data[pos] += s_Data[pos - 16];
|
||||||
|
|
||||||
return s_Data[pos];
|
return s_Data[pos];
|
||||||
}
|
}
|
||||||
|
@@ -82,6 +82,9 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
|
|||||||
//==============================================================================
|
//==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
|
||||||
|
|
||||||
|
|
||||||
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
|
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
|
||||||
//assuming size <= WARP_SIZE and size is power of 2
|
//assuming size <= WARP_SIZE and size is power of 2
|
||||||
template <class T>
|
template <class T>
|
||||||
@@ -92,10 +95,16 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
|
|||||||
pos += K_WARP_SIZE;
|
pos += K_WARP_SIZE;
|
||||||
s_Data[pos] = idata;
|
s_Data[pos] = idata;
|
||||||
|
|
||||||
for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
|
//for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
|
||||||
{
|
//{
|
||||||
s_Data[pos] += s_Data[pos - offset];
|
// s_Data[pos] += s_Data[pos - offset];
|
||||||
}
|
//}
|
||||||
|
|
||||||
|
s_Data[pos] += s_Data[pos - 1];
|
||||||
|
s_Data[pos] += s_Data[pos - 2];
|
||||||
|
s_Data[pos] += s_Data[pos - 4];
|
||||||
|
s_Data[pos] += s_Data[pos - 8];
|
||||||
|
s_Data[pos] += s_Data[pos - 16];
|
||||||
|
|
||||||
return s_Data[pos];
|
return s_Data[pos];
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user