commit 5aae21c037 (parent de27d3e023)
Author: Vladislav Vinogradov
Date: 2012-03-28 14:25:41 +00:00

3 changed files with 84 additions and 137 deletions

View File

@@ -77,110 +77,52 @@ NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
 //assuming size <= WARP_SIZE and size is power of 2
-//template <class T>
-//inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
-//{
-//    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-//    s_Data[pos] = 0;
-//    pos += K_WARP_SIZE;
-//    s_Data[pos] = idata;
-//
-//    s_Data[pos] += s_Data[pos - 1];
-//    s_Data[pos] += s_Data[pos - 2];
-//    s_Data[pos] += s_Data[pos - 4];
-//    s_Data[pos] += s_Data[pos - 8];
-//    s_Data[pos] += s_Data[pos - 16];
-//
-//    return s_Data[pos];
-//}
-//template <class T>
-//inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
-//{
-//    return warpScanInclusive(idata, s_Data) - idata;
-//}
-//
-//
-//template <class T, Ncv32u tiNumScanThreads>
-//inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
-//{
-//    if (tiNumScanThreads > K_WARP_SIZE)
-//    {
-//        //Bottom-level inclusive warp scan
-//        T warpResult = warpScanInclusive(idata, s_Data);
-//
-//        //Save top elements of each warp for exclusive warp scan
-//        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-//        __syncthreads();
-//        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-//        {
-//            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-//        }
-//
-//        //wait for warp scans to complete
-//        __syncthreads();
-//
-//        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-//        {
-//            //grab top warp elements
-//            T val = s_Data[threadIdx.x];
-//            //calculate exclusive scan and write back to shared memory
-//            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-//        }
-//
-//        //return updated warp scans with exclusive scan results
-//        __syncthreads();
-//        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-//    }
-//    else
-//    {
-//        return warpScanInclusive(idata, s_Data);
-//    }
-//}
-__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
+template <Ncv32u size>
+__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u* s_Data)
 {
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
+    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
     s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
+    pos += size;
     s_Data[pos] = idata;
 
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
+    for(Ncv32u offset = 1; offset < size; offset <<= 1)
+        s_Data[pos] += s_Data[pos - offset];
 
     return s_Data[pos];
 }
 
-__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
+template <Ncv32u size>
+__forceinline__ __device__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    return warpScanInclusive(idata, s_Data) - idata;
+    return warpScanInclusive<size>(idata, s_Data) - idata;
 }
 
-template <Ncv32u tiNumScanThreads>
+template <Ncv32u size, Ncv32u tiNumScanThreads>
 __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
 {
-    if (tiNumScanThreads > K_WARP_SIZE)
+    if(size > K_WARP_SIZE)
     {
         //Bottom-level inclusive warp scan
-        Ncv32u warpResult = warpScanInclusive(idata, s_Data);
+        Ncv32u warpResult = warpScanInclusive<K_WARP_SIZE>(idata, s_Data);
 
         //Save top elements of each warp for exclusive warp scan
         //sync to wait for warp scans to complete (because s_Data is being overwritten)
         __syncthreads();
         if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
+        {
             s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
+        }
 
         //wait for warp scans to complete
         __syncthreads();
 
         if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
         {
             //grab top warp elements
             Ncv32u val = s_Data[threadIdx.x];
             //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
+            s_Data[threadIdx.x] = warpScanExclusive<(size >> K_LOG2_WARP_SIZE)>(val, s_Data);
         }
@@ -189,7 +131,7 @@ __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
     }
     else
     {
-        return warpScanInclusive(idata, s_Data);
+        return warpScanInclusive<size>(idata, s_Data);
     }
 }
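A plausible reading of the new size template: the old warpScanExclusive was hard-wired to K_WARP_SIZE elements, so when scan1Inclusive applied it to only tiNumScanThreads / K_WARP_SIZE per-warp totals, the offset reads reached below the region that the few active threads had zero-padded; templating on size keeps the zero pad and the read offsets consistent for any power-of-two element count. Below is a minimal standalone sketch of the same zero-padded, double-length shared-buffer trick (my illustration, not code from the commit; the kernel and variable names are made up, and __syncwarp() is added because GPUs newer than the 2012 targets no longer guarantee the lock-step warp execution this style relied on):

#include <cstdio>
#include <cuda_runtime.h>

template <unsigned size>
__global__ void warpScanDemo(const unsigned* in, unsigned* out)
{
    __shared__ unsigned s_Data[2 * size];

    unsigned tid = threadIdx.x;
    unsigned pos = 2 * tid - (tid & (size - 1)); // == tid for a single warp
    s_Data[pos] = 0;                             // zero pad below the data
    pos += size;
    s_Data[pos] = in[tid];
    __syncwarp();

    for (unsigned offset = 1; offset < size; offset <<= 1)
    {
        unsigned t = s_Data[pos - offset];       // partner's partial sum (or the zero pad)
        __syncwarp();
        s_Data[pos] += t;
        __syncwarp();
    }

    out[tid] = s_Data[pos];                      // inclusive prefix sum of in[0..tid]
}

int main()
{
    const unsigned N = 32;
    unsigned h_in[N], h_out[N];
    for (unsigned i = 0; i < N; ++i) h_in[i] = 1; // expect out[i] == i + 1

    unsigned *d_in, *d_out;
    cudaMalloc(&d_in, N * sizeof(unsigned));
    cudaMalloc(&d_out, N * sizeof(unsigned));
    cudaMemcpy(d_in, h_in, N * sizeof(unsigned), cudaMemcpyHostToDevice);

    warpScanDemo<N><<<1, N>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, N * sizeof(unsigned), cudaMemcpyDeviceToHost);

    for (unsigned i = 0; i < N; ++i) printf("%u ", h_out[i]);
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}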
@@ -295,7 +237,7 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
     __shared__ Ncv32u numPassed;
     __shared__ Ncv32u outMaskOffset;
 
-    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL, NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
     __syncthreads();
 
     if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
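For context on what the kernel does with incScan: an inclusive scan over the per-thread 0/1 pass flags gives each surviving thread a unique output slot (its scan value minus one) and leaves the block's survivor count in the last element, which is presumably what numPassed and outMaskOffset hold before the block reserves space in the global output. A small host-side sketch of that flag-scan-scatter scheme (mine, not from the commit; the arrays and counts are made up):

#include <cstdio>

int main()
{
    const unsigned n = 8;
    unsigned passFlag[n] = {1, 0, 1, 1, 0, 0, 1, 0};
    unsigned incScan[n];

    unsigned running = 0;
    for (unsigned i = 0; i < n; ++i)      // inclusive scan of the flags
    {
        running += passFlag[i];
        incScan[i] = running;
    }

    unsigned numPassed = incScan[n - 1];  // survivors in this block
    for (unsigned i = 0; i < n; ++i)
        if (passFlag[i])
            printf("element %u -> output slot %u\n", i, incScan[i] - 1);

    printf("numPassed = %u\n", numPassed); // would be atomicAdd'd to a global counter
    return 0;
}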
@@ -391,11 +333,14 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
     NcvBool bPass = true;
 
-    if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
+    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
     {
-        Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
+        Ncv32f pixelStdDev = 0.0f;
+
+        if (!bInactiveThread)
+            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
 
-        for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
+        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
         {
             Ncv32f curStageSum = 0.0f;
@@ -409,6 +354,8 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                 NcvBool bMoreNodesToTraverse = true;
                 Ncv32u iNode = curRootNodeOffset;
 
+                if (bPass && !bInactiveThread)
+                {
                 while (bMoreNodesToTraverse)
                 {
                     HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
@@ -436,11 +383,11 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
 
 #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                         curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
 #else
                         curNodeVal += (Ncv32f)rectSum * rectWeight;
 #endif
                     }
 
                     HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
@@ -472,6 +419,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
                         iNode = nextNodeDescriptor.getNextNodeOffset();
                     }
                 }
+                }
 
                 __syncthreads();
                 curRootNodeOffset++;
@@ -481,7 +429,6 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
             {
                 bPass = false;
                 outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
-                break;
             }
         }
     }
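All of the kernel edits above serve one CUDA rule: __syncthreads() must be reached by every thread of the block. The old code let a thread that failed a stage break out of a loop that still contained barriers, and let inactive threads (under atomic compaction) skip the whole guarded region; the new code keeps every thread iterating and merely stops its work via bPass and bInactiveThread. A standalone sketch of that pattern (my illustration, not the actual kernel; all names and data are made up):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void stageLoopFixed(const float* stageScores, float threshold,
                               int nStages, unsigned* passMask)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    bool bPass = true;

    for (int s = 0; s < nStages; ++s)
    {
        float sum = 0.0f;

        if (bPass)                      // only still-passing threads do the work
            sum = stageScores[s * gridDim.x * blockDim.x + tid];

        __syncthreads();                // reached by every thread on every stage

        if (bPass && sum < threshold)   // fail: stop working, but keep looping
            bPass = false;
    }

    passMask[tid] = bPass ? 1u : 0u;
}

int main()
{
    const int nThreads = 64, nStages = 4, n = nThreads * nStages;
    float h_scores[n];
    for (int i = 0; i < n; ++i) h_scores[i] = (i % 7) * 0.5f;

    float* d_scores;  unsigned* d_mask;
    cudaMalloc(&d_scores, n * sizeof(float));
    cudaMalloc(&d_mask, nThreads * sizeof(unsigned));
    cudaMemcpy(d_scores, h_scores, n * sizeof(float), cudaMemcpyHostToDevice);

    stageLoopFixed<<<1, nThreads>>>(d_scores, 1.0f, nStages, d_mask);

    unsigned h_mask[nThreads];
    cudaMemcpy(h_mask, d_mask, nThreads * sizeof(unsigned), cudaMemcpyDeviceToHost);

    int passed = 0;
    for (int i = 0; i < nThreads; ++i) passed += h_mask[i];
    printf("%d of %d threads passed all stages\n", passed, nThreads);

    cudaFree(d_scores);
    cudaFree(d_mask);
    return 0;
}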
@@ -1100,7 +1047,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
     NcvBool bTexCacheCascade = devProp.major < 2;
     NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
 
-    NcvBool bDoAtomicCompaction = false;// devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
+    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
 
     NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
     NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
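Re-enabling bDoAtomicCompaction turns the hard-coded false back into a device-capability check. A minimal sketch of querying the properties that the expression above keys off (mine; it only demonstrates the query, not the original rationale for the compute-capability threshold):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, 0);

    bool bDoAtomicCompaction = devProp.major >= 2 ||
                               (devProp.major == 1 && devProp.minor >= 3);

    printf("sm_%d%d -> atomic compaction %s\n",
           devProp.major, devProp.minor,
           bDoAtomicCompaction ? "enabled" : "disabled");
    return 0;
}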

View File

@@ -116,7 +116,7 @@ int main(int argc, char** argv)
     TS::ptr()->init("gpu");
     InitGoogleTest(&argc, argv);
 
-    const char* keys ="{ nvtest_output_level | nvtest_output_level | none | NVidia test verbosity level }";
+    const char* keys ="{ nvtest_output_level | nvtest_output_level | compact | NVidia test verbosity level }";
 
     CommandLineParser parser(argc, (const char**)argv, keys);

View File

@@ -84,7 +84,7 @@ struct NVidiaTest : TestWithParam<cv::gpu::DeviceInfo>
 struct NPPST : NVidiaTest {};
 struct NCV : NVidiaTest {};
 
-OutputLevel nvidiaTestOutputLevel = OutputLevelNone;
+OutputLevel nvidiaTestOutputLevel = OutputLevelCompact;
 
 TEST_P(NPPST, Integral)
 {