Initial support for the GPU LBP classifier: added loading of the new-style XML format

2012-06-22 15:00:36 +00:00
parent 02170a0a58
commit 1365e28a54
22 changed files with 446 additions and 192 deletions
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -232,7 +232,7 @@ __device__ Ncv32u d_outMaskPosition;
 __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
 {
 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
-    
+
    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;
@@ -927,7 +927,7 @@ Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDes
 }


-NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
+NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
                                               Ncv32u &numDetections,
@@ -945,32 +945,41 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
                                               cudaDeviceProp &devProp,
                                               cudaStream_t cuStream)
 {
-    ncvAssertReturn(d_integralImage.memType() == d_weights.memType() &&
-                    d_integralImage.memType() == d_pixelMask.memType() &&
-                    d_integralImage.memType() == gpuAllocator.memType() &&
-                     (d_integralImage.memType() == NCVMemoryTypeDevice ||
-                      d_integralImage.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
+    ncvAssertReturn(integral.memType() == d_weights.memType()&&
+                    integral.memType() == d_pixelMask.memType() &&
+                    integral.memType() == gpuAllocator.memType() &&
+                   (integral.memType() == NCVMemoryTypeDevice ||
+                    integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
+
    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
+
    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
+
    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
-    ncvAssertReturn((d_integralImage.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
+
+    ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
+
    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
-                    d_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
-                    d_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
+                    integral.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
+                    integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
+
    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
+
    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
+
    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);
+
    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
@@ -979,7 +988,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag

    NCVStatus ncvStat;

-    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, d_integralImage.width, d_integralImage.height, d_integralImage.pitch);
+    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width, integral.height, integral.pitch);
    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width, d_weights.height, d_weights.pitch);
    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
@@ -997,7 +1006,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag

    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
    ncvAssertReturnNcvStat(ncvStat);
-    ncvStat = d_integralImage.copySolid(h_integralImage, 0);
+    ncvStat = integral.copySolid(h_integralImage, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_weights.copySolid(h_weights, 0);
    ncvAssertReturnNcvStat(ncvStat);
@@ -1071,8 +1080,8 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();

        size_t alignmentOffset;
-        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, d_integralImage.ptr(), cfdTexIImage,
-            (anchorsRoi.height + haar.ClassifierSize.height) * d_integralImage.pitch()), NCV_CUDA_ERROR);
+        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
+            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

@@ -1189,7 +1198,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
            grid1,
            block1,
            cuStream,
-            d_integralImage.ptr(), d_integralImage.stride(),
+            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
@@ -1259,7 +1268,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
            grid2,
            block2,
            cuStream,
-            d_integralImage.ptr(), d_integralImage.stride(),
+            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
@@ -1320,7 +1329,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
            grid3,
            block3,
            cuStream,
-            d_integralImage.ptr(), d_integralImage.stride(),
+            integral.ptr(), integral.stride(),
            d_weights.ptr(), d_weights.stride(),
            d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
            d_ptrNowData->ptr(),
@@ -1455,10 +1464,14 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
                                         cudaStream_t cuStream)
 {
    ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
+
    ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
                    pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
+
    ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI);
+
    ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
+
    ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
                    numPixelMaskDetections <= pixelMask.length() &&
                    totalMaxDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);
@@ -1527,12 +1540,16 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                    d_srcImg.memType() == gpuAllocator.memType() &&
                     (d_srcImg.memType() == NCVMemoryTypeDevice ||
                      d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
+
    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                     (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                      d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
+
    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
+
    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
+
    ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
@@ -1540,13 +1557,17 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
                    d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
                    srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
                    d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);
+
    ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);
+
    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
+
    ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
+
    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    //TODO: set NPP active stream to cuStream
@@ -1557,8 +1578,8 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
    Ncv32u integralWidth = d_srcImg.width() + 1;
    Ncv32u integralHeight = d_srcImg.height() + 1;

-    NCVMatrixAlloc<Ncv32u> d_integralImage(gpuAllocator, integralWidth, integralHeight);
-    ncvAssertReturn(d_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight);
+    ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
    ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

@@ -1589,7 +1610,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
    NCV_SKIP_COND_BEGIN

    nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
-                                       d_integralImage.ptr(), d_integralImage.pitch(),
+                                       integral.ptr(), integral.pitch(),
                                       NcvSize32u(d_srcImg.width(), d_srcImg.height()),
                                       d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
    ncvAssertReturnNcvStat(nppStat);
@@ -1676,7 +1697,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
        NCV_SKIP_COND_BEGIN

        nppStat = nppiStDecimate_32u_C1R(
-            d_integralImage.ptr(), d_integralImage.pitch(),
+            integral.ptr(), integral.pitch(),
            d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
            srcIIRoi, scale, true);
        ncvAssertReturnNcvStat(nppStat);
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -1,7 +1,7 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// 
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
@@ -95,11 +95,6 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

-    //for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
-    //{
-    //    s_Data[pos] += s_Data[pos - offset];
-    //}
-
    s_Data[pos] += s_Data[pos - 1];
    s_Data[pos] += s_Data[pos - 2];
    s_Data[pos] += s_Data[pos - 4];
@@ -315,7 +310,7 @@ NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
        <T_in, T_out, tbDoSqr>
        <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
        (d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
-    
+
    ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);

    return NPPST_SUCCESS;
@@ -1447,14 +1442,14 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
        //adjust hierarchical partial sums
        for (Ncv32s i=(Ncv32s)partSumNums.size()-3; i>=0; i--)
        {
-            dim3 grid(partSumNums[i+1]);
-            if (grid.x > 65535)
+            dim3 grid_local(partSumNums[i+1]);
+            if (grid_local.x > 65535)
            {
-                grid.y = (grid.x + 65534) / 65535;
-                grid.x = 65535;
+                grid_local.y = (grid_local.x + 65534) / 65535;
+                grid_local.x = 65535;
            }
            removePass2Adjust
-                <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+                <<<grid_local, block, 0, nppStGetActiveCUDAstream()>>>
                (d_hierSums.ptr() + partSumOffsets[i], partSumNums[i],
                 d_hierSums.ptr() + partSumOffsets[i+1]);

@@ -1463,10 +1458,10 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
    }
    else
    {
-        dim3 grid(partSumNums[1]);
+        dim3 grid_local(partSumNums[1]);
        removePass1Scan
            <true, false>
-            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            <<<grid_local, block, 0, nppStGetActiveCUDAstream()>>>
            (d_src, srcLen,
             d_hierSums.ptr(),
             NULL, elemRemove);
@@ -1651,7 +1646,7 @@ __forceinline__ __device__ float getValueMirrorColumn(const int offset,


 __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
-                                              Ncv32f *pDst, 
+                                              Ncv32f *pDst,
                                              NcvSize32u dstSize,
                                              Ncv32u dstStep,
                                              NcvRect32u roi,
@@ -1677,7 +1672,7 @@ __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
    float sum = 0.0f;
    for (int m = 0; m < nKernelSize; ++m)
    {
-        sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width) 
+        sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
            * tex1Dfetch (texKernel, m);
    }

@@ -1709,7 +1704,7 @@ __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
    float sum = 0.0f;
    for (int m = 0; m < nKernelSize; ++m)
    {
-        sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height) 
+        sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
            * tex1Dfetch (texKernel, m);
    }

@@ -1879,7 +1874,7 @@ texture<float, 2, cudaReadModeElementType> tex_src0;
 __global__ void BlendFramesKernel(const float *u, const float *v,   // forward flow
                                  const float *ur, const float *vr, // backward flow
                                  const float *o0, const float *o1, // coverage masks
-                                  int w, int h, int s, 
+                                  int w, int h, int s,
                                  float theta, float *out)
 {
    const int ix = threadIdx.x + blockDim.x * blockIdx.x;
@@ -1903,7 +1898,7 @@ __global__ void BlendFramesKernel(const float *u, const float *v,   // forward f
    if (b0 && b1)
    {
        // pixel is visible on both frames
-        out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) + 
+        out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
            tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
    }
    else if (b0)
@@ -2004,8 +1999,8 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
    Ncv32f *bwdV = pState->ppBuffers[5]; // backward v
    // warp flow
    ncvAssertReturnNcvStat (
-        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFU, 
-        pState->size, 
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFU,
+        pState->size,
        pState->nStep,
        pState->pFU,
        pState->pFV,
@@ -2014,8 +2009,8 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
        pState->pos,
        fwdU) );
    ncvAssertReturnNcvStat (
-        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFV, 
-        pState->size, 
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFV,
+        pState->size,
        pState->nStep,
        pState->pFU,
        pState->pFV,
@@ -2025,8 +2020,8 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
        fwdV) );
    // warp backward flow
    ncvAssertReturnNcvStat (
-        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBU, 
-        pState->size, 
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBU,
+        pState->size,
        pState->nStep,
        pState->pBU,
        pState->pBV,
@@ -2035,8 +2030,8 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
        1.0f - pState->pos,
        bwdU) );
    ncvAssertReturnNcvStat (
-        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBV, 
-        pState->size, 
+        nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBV,
+        pState->size,
        pState->nStep,
        pState->pBU,
        pState->pBV,
@@ -2252,7 +2247,7 @@ NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
                                         Ncv32f timeScale,
                                         Ncv32f *pDst)
 {
-    ncvAssertReturn (pSrc != NULL && 
+    ncvAssertReturn (pSrc != NULL &&
        pU   != NULL &&
        pV   != NULL &&
        pDst != NULL, NPPST_NULL_POINTER_ERROR);
@@ -2286,7 +2281,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
                                         Ncv32f timeScale,
                                         Ncv32f *pDst)
 {
-    ncvAssertReturn (pSrc != NULL && 
+    ncvAssertReturn (pSrc != NULL &&
        pU   != NULL &&
        pV   != NULL &&
        pDst != NULL &&
@@ -2375,7 +2370,7 @@ __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
    }

    float rw = (float) srcROI.width;
-    float rh = (float) srcROI.height; 
+    float rh = (float) srcROI.height;

    // source position
    float x = scaleX * (float) ix;
@@ -2529,7 +2524,7 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
    ncvAssertReturn (pSrc != NULL && pDst != NULL, NPPST_NULL_POINTER_ERROR);
    ncvAssertReturn (xFactor != 0.0 && yFactor != 0.0, NPPST_INVALID_SCALE);

-    ncvAssertReturn (nSrcStep >= sizeof (Ncv32f) * (Ncv32u) srcSize.width && 
+    ncvAssertReturn (nSrcStep >= sizeof (Ncv32f) * (Ncv32u) srcSize.width &&
        nDstStep >= sizeof (Ncv32f) * (Ncv32f) dstSize.width,
        NPPST_INVALID_STEP);

@@ -2547,7 +2542,7 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
        dim3 gridSize ((dstROI.width  + ctaSize.x - 1) / ctaSize.x,
            (dstROI.height + ctaSize.y - 1) / ctaSize.y);

-        resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> 
+        resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
            (srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
    }
    else if (interpolation == nppStBicubic)
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.hpp
@@ -1,7 +1,7 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// 
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
@@ -132,7 +132,7 @@ enum NppStInterpMode


 /** Size of a buffer required for interpolation.
- * 
+ *
 * Requires several such buffers. See \see NppStInterpolationState.
 *
 * \param srcSize           [IN]  Frame size (both frames must be of the same size)
@@ -177,17 +177,17 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
 * \return NCV status code
 */
 NCV_EXPORTS
-NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc, 
-                                        NcvSize32u srcSize, 
+NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
+                                        NcvSize32u srcSize,
                                        Ncv32u nSrcStep,
-                                        Ncv32f *pDst, 
-                                        NcvSize32u dstSize, 
+                                        Ncv32f *pDst,
+                                        NcvSize32u dstSize,
                                        Ncv32u nDstStep,
-                                        NcvRect32u oROI, 
+                                        NcvRect32u oROI,
                                        NppStBorderType borderType,
-                                        const Ncv32f *pKernel, 
+                                        const Ncv32f *pKernel,
                                        Ncv32s nKernelSize,
-                                        Ncv32s nAnchor, 
+                                        Ncv32s nAnchor,
                                        Ncv32f multiplier);


@@ -225,14 +225,14 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,


 /** Size of buffer required for vector image warping.
- * 
+ *
 * \param srcSize           [IN]  Source image size
 * \param nStep             [IN]  Source image line step
 * \param hpSize            [OUT] Where to store computed size (host memory)
 *
 * \return NCV status code
 */
-NCV_EXPORTS 
+NCV_EXPORTS
 NCVStatus nppiStVectorWarpGetBufferSize(NcvSize32u srcSize,
                                        Ncv32u nSrcStep,
                                        Ncv32u *hpSize);
@@ -316,7 +316,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
 * \param xFactor           [IN]  Row scale factor
 * \param yFactor           [IN]  Column scale factor
 * \param interpolation     [IN]  Interpolation type
- * 
+ *
 * \return NCV status code
 */
 NCV_EXPORTS
--- a/modules/gpu/src/nvidia/core/NCVColorConversion.hpp
+++ b/modules/gpu/src/nvidia/core/NCVColorConversion.hpp
@@ -1,7 +1,7 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// 
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
@@ -39,11 +39,14 @@
 //
 //M*/

+// NOTE: none of the code in this file is currently used; it is compiled out by the #if 0 block below.
+
 #ifndef _ncv_color_conversion_hpp_
 #define _ncv_color_conversion_hpp_

 #include "NCVPixelOperations.hpp"

+#if 0
 enum NCVColorSpace
 {
    NCVColorSpaceGray,
@@ -71,8 +74,7 @@ static void _pixColorConv(const Tin &pixIn, Tout &pixOut)
 }};

 template<NCVColorSpace CSin, NCVColorSpace CSout, typename Tin, typename Tout>
-static
-NCVStatus _ncvColorConv_host(const NCVMatrix<Tin> &h_imgIn,
+static NCVStatus _ncvColorConv_host(const NCVMatrix<Tin> &h_imgIn,
                             const NCVMatrix<Tout> &h_imgOut)
 {
    ncvAssertReturn(h_imgIn.size() == h_imgOut.size(), NCV_DIMENSIONS_INVALID);
@@ -92,5 +94,6 @@ NCVStatus _ncvColorConv_host(const NCVMatrix<Tin> &h_imgIn,
    NCV_SKIP_COND_END
    return NCV_SUCCESS;
 }
+#endif

 #endif //_ncv_color_conversion_hpp_
--- a/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp
+++ b/modules/gpu/src/nvidia/core/NCVPixelOperations.hpp
@@ -1,7 +1,7 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// 
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
@@ -47,38 +47,38 @@
 #include "NCV.hpp"

 template<typename TBase> inline __host__ __device__ TBase _pixMaxVal();
-template<> static inline __host__ __device__ Ncv8u _pixMaxVal<Ncv8u>() {return UCHAR_MAX;}
+template<> static inline __host__ __device__ Ncv8u  _pixMaxVal<Ncv8u>()  {return UCHAR_MAX;}
 template<> static inline __host__ __device__ Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
-template<> static inline __host__ __device__ Ncv32u _pixMaxVal<Ncv32u>() {return UINT_MAX;}
-template<> static inline __host__ __device__ Ncv8s _pixMaxVal<Ncv8s>() {return CHAR_MAX;}
-template<> static inline __host__ __device__ Ncv16s _pixMaxVal<Ncv16s>() {return SHRT_MAX;}
-template<> static inline __host__ __device__ Ncv32s _pixMaxVal<Ncv32s>() {return INT_MAX;}
-template<> static inline __host__ __device__ Ncv32f _pixMaxVal<Ncv32f>() {return FLT_MAX;}
-template<> static inline __host__ __device__ Ncv64f _pixMaxVal<Ncv64f>() {return DBL_MAX;}
+template<> static inline __host__ __device__ Ncv32u _pixMaxVal<Ncv32u>() {return  UINT_MAX;}
+template<> static inline __host__ __device__ Ncv8s  _pixMaxVal<Ncv8s>()  {return  CHAR_MAX;}
+template<> static inline __host__ __device__ Ncv16s _pixMaxVal<Ncv16s>() {return  SHRT_MAX;}
+template<> static inline __host__ __device__ Ncv32s _pixMaxVal<Ncv32s>() {return   INT_MAX;}
+template<> static inline __host__ __device__ Ncv32f _pixMaxVal<Ncv32f>() {return   FLT_MAX;}
+template<> static inline __host__ __device__ Ncv64f _pixMaxVal<Ncv64f>() {return   DBL_MAX;}

 template<typename TBase> inline __host__ __device__ TBase _pixMinVal();
-template<> static inline __host__ __device__ Ncv8u _pixMinVal<Ncv8u>() {return 0;}
+template<> static inline __host__ __device__ Ncv8u  _pixMinVal<Ncv8u>()  {return 0;}
 template<> static inline __host__ __device__ Ncv16u _pixMinVal<Ncv16u>() {return 0;}
 template<> static inline __host__ __device__ Ncv32u _pixMinVal<Ncv32u>() {return 0;}
-template<> static inline __host__ __device__ Ncv8s _pixMinVal<Ncv8s>() {return CHAR_MIN;}
+template<> static inline __host__ __device__ Ncv8s  _pixMinVal<Ncv8s>()  {return CHAR_MIN;}
 template<> static inline __host__ __device__ Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
 template<> static inline __host__ __device__ Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
 template<> static inline __host__ __device__ Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
 template<> static inline __host__ __device__ Ncv64f _pixMinVal<Ncv64f>() {return DBL_MIN;}

 template<typename Tvec> struct TConvVec2Base;
-template<> struct TConvVec2Base<uchar1> {typedef Ncv8u TBase;};
-template<> struct TConvVec2Base<uchar3> {typedef Ncv8u TBase;};
-template<> struct TConvVec2Base<uchar4> {typedef Ncv8u TBase;};
+template<> struct TConvVec2Base<uchar1>  {typedef Ncv8u TBase;};
+template<> struct TConvVec2Base<uchar3>  {typedef Ncv8u TBase;};
+template<> struct TConvVec2Base<uchar4>  {typedef Ncv8u TBase;};
 template<> struct TConvVec2Base<ushort1> {typedef Ncv16u TBase;};
 template<> struct TConvVec2Base<ushort3> {typedef Ncv16u TBase;};
 template<> struct TConvVec2Base<ushort4> {typedef Ncv16u TBase;};
-template<> struct TConvVec2Base<uint1> {typedef Ncv32u TBase;};
-template<> struct TConvVec2Base<uint3> {typedef Ncv32u TBase;};
-template<> struct TConvVec2Base<uint4> {typedef Ncv32u TBase;};
-template<> struct TConvVec2Base<float1> {typedef Ncv32f TBase;};
-template<> struct TConvVec2Base<float3> {typedef Ncv32f TBase;};
-template<> struct TConvVec2Base<float4> {typedef Ncv32f TBase;};
+template<> struct TConvVec2Base<uint1>   {typedef Ncv32u TBase;};
+template<> struct TConvVec2Base<uint3>   {typedef Ncv32u TBase;};
+template<> struct TConvVec2Base<uint4>   {typedef Ncv32u TBase;};
+template<> struct TConvVec2Base<float1>  {typedef Ncv32f TBase;};
+template<> struct TConvVec2Base<float3>  {typedef Ncv32f TBase;};
+template<> struct TConvVec2Base<float4>  {typedef Ncv32f TBase;};
 template<> struct TConvVec2Base<double1> {typedef Ncv64f TBase;};
 template<> struct TConvVec2Base<double3> {typedef Ncv64f TBase;};
 template<> struct TConvVec2Base<double4> {typedef Ncv64f TBase;};
@@ -86,9 +86,9 @@ template<> struct TConvVec2Base<double4> {typedef Ncv64f TBase;};
 #define NC(T)       (sizeof(T) / sizeof(TConvVec2Base<T>::TBase))

 template<typename TBase, Ncv32u NC> struct TConvBase2Vec;
-template<> struct TConvBase2Vec<Ncv8u, 1> {typedef uchar1 TVec;};
-template<> struct TConvBase2Vec<Ncv8u, 3> {typedef uchar3 TVec;};
-template<> struct TConvBase2Vec<Ncv8u, 4> {typedef uchar4 TVec;};
+template<> struct TConvBase2Vec<Ncv8u, 1>  {typedef uchar1 TVec;};
+template<> struct TConvBase2Vec<Ncv8u, 3>  {typedef uchar3 TVec;};
+template<> struct TConvBase2Vec<Ncv8u, 4>  {typedef uchar4 TVec;};
 template<> struct TConvBase2Vec<Ncv16u, 1> {typedef ushort1 TVec;};
 template<> struct TConvBase2Vec<Ncv16u, 3> {typedef ushort3 TVec;};
 template<> struct TConvBase2Vec<Ncv16u, 4> {typedef ushort4 TVec;};
--- a/modules/gpu/src/nvidia/core/NCVPyramid.cu
+++ b/modules/gpu/src/nvidia/core/NCVPyramid.cu
@@ -202,7 +202,7 @@ __global__ void kernelDownsampleX2(T *d_src,
    }
 }

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace pyramid
    {
@@ -211,7 +211,7 @@ namespace cv { namespace gpu { namespace device
            dim3 bDim(16, 8);
            dim3 gDim(divUp(src.cols, bDim.x), divUp(src.rows, bDim.y));

-            kernelDownsampleX2<<<gDim, bDim, 0, stream>>>((T*)src.data, static_cast<Ncv32u>(src.step), 
+            kernelDownsampleX2<<<gDim, bDim, 0, stream>>>((T*)src.data, static_cast<Ncv32u>(src.step),
                (T*)dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );
@@ -277,7 +277,7 @@ __global__ void kernelInterpolateFrom1(T *d_srcTop,
        d_dst_line[j] = outPix;
    }
 }
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace pyramid
    {
@@ -286,7 +286,7 @@ namespace cv { namespace gpu { namespace device
            dim3 bDim(16, 8);
            dim3 gDim(divUp(dst.cols, bDim.x), divUp(dst.rows, bDim.y));

-            kernelInterpolateFrom1<<<gDim, bDim, 0, stream>>>((T*) src.data, static_cast<Ncv32u>(src.step), NcvSize32u(src.cols, src.rows), 
+            kernelInterpolateFrom1<<<gDim, bDim, 0, stream>>>((T*) src.data, static_cast<Ncv32u>(src.step), NcvSize32u(src.cols, src.rows),
                (T*) dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );
--- a/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
+++ b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp
@@ -1,7 +1,7 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
-// 
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
@@ -54,14 +54,14 @@
 // The Loki Library
 // Copyright (c) 2001 by Andrei Alexandrescu
 // This code accompanies the book:
-// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design 
+// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design
 //     Patterns Applied". Copyright (c) 2001. Addison-Wesley.
-// Permission to use, copy, modify, distribute and sell this software for any 
-//     purpose is hereby granted without fee, provided that the above copyright 
-//     notice appear in all copies and that both that copyright notice and this 
+// Permission to use, copy, modify, distribute and sell this software for any
+//     purpose is hereby granted without fee, provided that the above copyright
+//     notice appear in all copies and that both that copyright notice and this
 //     permission notice appear in supporting documentation.
-// The author or Addison-Welsey Longman make no representations about the 
-//     suitability of this software for any purpose. It is provided "as is" 
+// The author or Addison-Welsey Longman make no representations about the
+//     suitability of this software for any purpose. It is provided "as is"
 //     without express or implied warranty.
 // http://loki-lib.sourceforge.net/index.php?n=Main.License
 ////////////////////////////////////////////////////////////////////////////////
@@ -71,7 +71,7 @@ namespace Loki
    //==============================================================================
    // class NullType
    // Used as a placeholder for "no type here"
-    // Useful as an end marker in typelists 
+    // Useful as an end marker in typelists
    //==============================================================================

    class NullType {};
@@ -110,7 +110,7 @@ namespace Loki
        //==============================================================================
        // class template TypeAt
        // Finds the type at a given index in a typelist
-        // Invocation (TList is a typelist and index is a compile-time integral 
+        // Invocation (TList is a typelist and index is a compile-time integral
        //     constant):
        // TypeAt<TList, index>::Result
        // returns the type in position 'index' in TList