commit 5aae21c037 (parent de27d3e023)
Author: Vladislav Vinogradov
Date: 2012-03-28 14:25:41 +00:00

3 changed files with 84 additions and 137 deletions

View File

@@ -77,110 +77,52 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
 //assuming size <= WARP_SIZE and size is power of 2
-//template <class T>
-//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
-//{
-//    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-//    s_Data[pos] = 0;
-//    pos += K_WARP_SIZE;
-//    s_Data[pos] = idata;
-//
-//    s_Data[pos] += s_Data[pos - 1];
-//    s_Data[pos] += s_Data[pos - 2];
-//    s_Data[pos] += s_Data[pos - 4];
-//    s_Data[pos] += s_Data[pos - 8];
-//    s_Data[pos] += s_Data[pos - 16];
-//
-//    return s_Data[pos];
-//}
-//template <class T>
-//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
-//{
-//    return warpScanInclusive(idata, s_Data) - idata;
-//}
-//
-//
-//template <class T, Ncv32u tiNumScanThreads>
-//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
-//{
-//    if (tiNumScanThreads > K_WARP_SIZE)
-//    {
-//        //Bottom-level inclusive warp scan
-//        T warpResult = warpScanInclusive(idata, s_Data);
-//
-//        //Save top elements of each warp for exclusive warp scan
-//        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-//        __syncthreads();
-//        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-//        {
-//            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-//        }
-//
-//        //wait for warp scans to complete
-//        __syncthreads();
-//
-//        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-//        {
-//            //grab top warp elements
-//            T val = s_Data[threadIdx.x];
-//            //calculate exclusive scan and write back to shared memory
-//            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-//        }
-//
-//        //return updated warp scans with exclusive scan results
-//        __syncthreads();
-//        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-//    }
-//    else
-//    {
-//        return warpScanInclusive(idata, s_Data);
-//    }
-//}
-__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
+template <Ncv32u size>
+__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data)
 {
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
     s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
+    pos += size;
     s_Data[pos] = idata;
 
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
+    for(Ncv32u offset = 1; offset < size; offset <<= 1)
+        s_Data[pos] += s_Data[pos - offset];
 
     return s_Data[pos];
 }
 
-__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
+template <Ncv32u size>
+__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    return warpScanInclusive(idata, s_Data) - idata;
+    return warpScanInclusive<size>(idata, s_Data) - idata;
 }
 
-template <Ncv32u tiNumScanThreads>
+template <Ncv32u size, Ncv32u tiNumScanThreads>
 __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    if (tiNumScanThreads > K_WARP_SIZE)
+    if(size > K_WARP_SIZE)
     {
         //Bottom-level inclusive warp scan
-        Ncv32u warpResult = warpScanInclusive(idata, s_Data);
+        Ncv32u warpResult = warpScanInclusive<K_WARP_SIZE>(idata, s_Data);
 
         //Save top elements of each warp for exclusive warp scan
         //sync to wait for warp scans to complete (because s_Data is being overwritten)
         __syncthreads();
         if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
+        {
             s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
+        }
 
         //wait for warp scans to complete
         __syncthreads();
 
         if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
         {
             //grab top warp elements
             Ncv32u val = s_Data[threadIdx.x];
             //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
+            s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data);
         }
@@ -189,7 +131,7 @@ __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
     }
     else
     {
-        return warpScanInclusive(idata, s_Data);
+        return warpScanInclusive<size>(idata, s_Data);
     }
 }
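A plausible reading of the new size template: the old warpScanExclusive was hard-wired to K_WARP_SIZE elements, so when scan1Inclusive applied it to only tiNumScanThreads / K_WARP_SIZE per-warp totals, the offset reads reached below the region that the few active threads had zero-padded; templating on size keeps the zero pad and the read offsets consistent for any power-of-two element count. Below is a minimal standalone sketch of the same zero-padded, double-length shared-buffer trick (my illustration, not code from the commit; the kernel and variable names are made up, and __syncwarp() is added because GPUs newer than the 2012 targets no longer guarantee the lock-step warp execution this style relied on):

#include <cstdio>
#include <cuda_runtime.h>

template <unsigned size>
__global__ void warpScanDemo(const unsigned* in, unsigned* out)
{
    __shared__ unsigned s_Data[2 * size];

    unsigned tid = threadIdx.x;
    unsigned pos = 2 * tid - (tid & (size - 1)); // == tid for a single warp
    s_Data[pos] = 0;                             // zero pad below the data
    pos += size;
    s_Data[pos] = in[tid];
    __syncwarp();

    for (unsigned offset = 1; offset < size; offset <<= 1)
    {
        unsigned t = s_Data[pos - offset];       // partner's partial sum (or the zero pad)
        __syncwarp();
        s_Data[pos] += t;
        __syncwarp();
    }

    out[tid] = s_Data[pos];                      // inclusive prefix sum of in[0..tid]
}

int main()
{
    const unsigned N = 32;
    unsigned h_in[N], h_out[N];
    for (unsigned i = 0; i < N; ++i) h_in[i] = 1; // expect out[i] == i + 1

    unsigned *d_in, *d_out;
    cudaMalloc(&d_in, N * sizeof(unsigned));
    cudaMalloc(&d_out, N * sizeof(unsigned));
    cudaMemcpy(d_in, h_in, N * sizeof(unsigned), cudaMemcpyHostToDevice);

    warpScanDemo<N><<<1, N>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, N * sizeof(unsigned), cudaMemcpyDeviceToHost);

    for (unsigned i = 0; i < N; ++i) printf("%u ", h_out[i]);
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}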
@@ -295,7 +237,7 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
     __shared__ Ncv32u numPassed;
     __shared__ Ncv32u outMaskOffset;
 
-    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL, NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
     __syncthreads();
 
     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
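For context on what the kernel does with incScan: an inclusive scan over the per-thread 0/1 pass flags gives each surviving thread a unique output slot (its scan value minus one) and leaves the block's survivor count in the last element, which is presumably what numPassed and outMaskOffset hold before the block reserves space in the global output. A small host-side sketch of that flag-scan-scatter scheme (mine, not from the commit; the arrays and counts are made up):

#include <cstdio>

int main()
{
    const unsigned n = 8;
    unsigned passFlag[n] = {1, 0, 1, 1, 0, 0, 1, 0};
    unsigned incScan[n];

    unsigned running = 0;
    for (unsigned i = 0; i < n; ++i)      // inclusive scan of the flags
    {
        running += passFlag[i];
        incScan[i] = running;
    }

    unsigned numPassed = incScan[n - 1];  // survivors in this block
    for (unsigned i = 0; i < n; ++i)
        if (passFlag[i])
            printf("element %u -> output slot %u\n", i, incScan[i] - 1);

    printf("numPassed = %u\n", numPassed); // would be atomicAdd'd to a global counter
    return 0;
}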
@@ -391,11 +333,14 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
     NcvBool bPass = true;
 
-    if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
+    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
     {
-        Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
+        Ncv32f pixelStdDev = 0.0f;
+
+        if (!bInactiveThread)
+            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
 
-        for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
+        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
         {
             Ncv32f curStageSum = 0.0f;
@@ -409,6 +354,8 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                 NcvBool bMoreNodesToTraverse = true;
                 Ncv32u iNode = curRootNodeOffset;
 
+                if (bPass && !bInactiveThread)
+                {
                 while (bMoreNodesToTraverse)
                 {
                     HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
@@ -436,11 +383,11 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
 
 #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                         curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
 #else
                         curNodeVal += (Ncv32f)rectSum * rectWeight;
 #endif
                     }
 
                     HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
@@ -472,6 +419,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                         iNode = nextNodeDescriptor.getNextNodeOffset();
                     }
                 }
+                }
 
                 __syncthreads();
                 curRootNodeOffset++;
@@ -481,7 +429,6 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
             {
                 bPass = false;
                 outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
-                break;
             }
         }
     }
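All of the kernel edits above serve one CUDA rule: __syncthreads() must be reached by every thread of the block. The old code let a thread that failed a stage break out of a loop that still contained barriers, and let inactive threads (under atomic compaction) skip the whole guarded region; the new code keeps every thread iterating and merely stops its work via bPass and bInactiveThread. A standalone sketch of that pattern (my illustration, not the actual kernel; all names and data are made up):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void stageLoopFixed(const float* stageScores, float threshold,
                               int nStages, unsigned* passMask)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    bool bPass = true;

    for (int s = 0; s < nStages; ++s)
    {
        float sum = 0.0f;

        if (bPass)                      // only still-passing threads do the work
            sum = stageScores[s * gridDim.x * blockDim.x + tid];

        __syncthreads();                // reached by every thread on every stage

        if (bPass && sum < threshold)   // fail: stop working, but keep looping
            bPass = false;
    }

    passMask[tid] = bPass ? 1u : 0u;
}

int main()
{
    const int nThreads = 64, nStages = 4, n = nThreads * nStages;
    float h_scores[n];
    for (int i = 0; i < n; ++i) h_scores[i] = (i % 7) * 0.5f;

    float* d_scores;  unsigned* d_mask;
    cudaMalloc(&d_scores, n * sizeof(float));
    cudaMalloc(&d_mask, nThreads * sizeof(unsigned));
    cudaMemcpy(d_scores, h_scores, n * sizeof(float), cudaMemcpyHostToDevice);

    stageLoopFixed<<<1, nThreads>>>(d_scores, 1.0f, nStages, d_mask);

    unsigned h_mask[nThreads];
    cudaMemcpy(h_mask, d_mask, nThreads * sizeof(unsigned), cudaMemcpyDeviceToHost);

    int passed = 0;
    for (int i = 0; i < nThreads; ++i) passed += h_mask[i];
    printf("%d of %d threads passed all stages\n", passed, nThreads);

    cudaFree(d_scores);
    cudaFree(d_mask);
    return 0;
}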
@@ -1100,7 +1047,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
     NcvBool bTexCacheCascade = devProp.major < 2;
     NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
 
-    NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
+    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
 
     NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
     NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
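Re-enabling bDoAtomicCompaction turns the hard-coded false back into a device-capability check. A minimal sketch of querying the properties that the expression above keys off (mine; it only demonstrates the query, not the original rationale for the compute-capability threshold):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, 0);

    bool bDoAtomicCompaction = devProp.major >= 2 ||
                               (devProp.major == 1 && devProp.minor >= 3);

    printf("sm_%d%d -> atomic compaction %s\n",
           devProp.major, devProp.minor,
           bDoAtomicCompaction ? "enabled" : "disabled");
    return 0;
}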

View File

@@ -116,7 +116,7 @@ int main(int argc, char** argv)
     TS::ptr()->init("gpu");
     InitGoogleTest(&argc, argv);
 
-    const char* keys ="{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level }";
+    const char* keys ="{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }";
 
     CommandLineParser parser(argc, (const char**)argv, keys);

View File

@@ -84,7 +84,7 @@ struct NVidiaTest : TestWithParam<cv::gpu::DeviceInfo>
 struct NPPST : NVidiaTest {};
 struct NCV : NVidiaTest {};
 
-OutputLevel nvidiaTestOutputLevel = OutputLevelNone;
+OutputLevel nvidiaTestOutputLevel = OutputLevelCompact;
 
 TEST_P(NPPST, Integral)
 {