1) NPP_staging added as sources; binaries removed.

2) NVIDIA tests for the GPU module.
3) Face detection (FD) sample that uses NVIDIA's interface.
Anatoly Baksheev
2011-02-04 15:15:25 +00:00
parent 811f6fbe92
commit 0747f2d863
52 changed files with 6042 additions and 1240 deletions


@@ -83,25 +83,25 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz) );
}
else if (src.elemSize() == 4)
{
NppStSize32u sz;
NcvSize32u sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiStTranspose_32u_C1R(const_cast<NppSt32u*>(src.ptr<NppSt32u>()), src.step,
dst.ptr<NppSt32u>(), dst.step, sz) );
nppSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), src.step,
dst.ptr<Ncv32u>(), dst.step, sz) );
}
else // if (src.elemSize() == 8)
{
NppStSize32u sz;
NcvSize32u sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiStTranspose_64u_C1R(const_cast<NppSt64u*>(src.ptr<NppSt64u>()), src.step,
dst.ptr<NppSt64u>(), dst.step, sz) );
nppSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), src.step,
dst.ptr<Ncv64u>(), dst.step, sz) );
}
cudaSafeCall( cudaThreadSynchronize() );


@@ -126,7 +126,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
minNeighbors,
scaleStep, 1,
flags,
*gpuAllocator, *cpuAllocator, devProp.major, devProp.minor, 0);
*gpuAllocator, *cpuAllocator, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
@@ -146,8 +146,8 @@ private:
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
// Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice);
cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned);
gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment);
cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment);
ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
@@ -212,7 +212,7 @@ private:
roi.height = d_src.height();
Ncv32u numDetections;
ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0);
*d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);


@@ -560,16 +560,19 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer)
sum.create(src.rows + 1, src.cols + 1, CV_32S);
NppStSize32u roiSize;
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
NppSt32u bufSize;
nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize) );
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
Ncv32u bufSize;
nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer);
nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<NppSt8u*>(src.ptr<NppSt8u>()), src.step,
sum.ptr<NppSt32u>(), sum.step, roiSize, buffer.ptr<NppSt8u>(), bufSize) );
nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>()), src.step,
sum.ptr<Ncv32u>(), sum.step, roiSize, buffer.ptr<Ncv8u>(), bufSize, prop) );
cudaSafeCall( cudaThreadSynchronize() );
}
@@ -600,19 +603,20 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum)
{
CV_Assert(src.type() == CV_8U);
NppStSize32u roiSize;
NcvSize32u roiSize;
roiSize.width = src.cols;
roiSize.height = src.rows;
NppSt32u bufSize;
nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize));
cudaDeviceProp prop;
cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) );
Ncv32u bufSize;
nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop));
GpuMat buf(1, bufSize, CV_8U);
sqsum.create(src.rows + 1, src.cols + 1, CV_64F);
nppSafeCall(nppiStSqrIntegral_8u64u_C1R(
const_cast<NppSt8u*>(src.ptr<NppSt8u>(0)), src.step,
sqsum.ptr<NppSt64u>(0), sqsum.step, roiSize,
buf.ptr<NppSt8u>(0), bufSize));
nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast<Ncv8u*>(src.ptr<Ncv8u>(0)), src.step,
sqsum.ptr<Ncv64u>(0), sqsum.step, roiSize, buf.ptr<Ncv8u>(0), bufSize, prop));
cudaSafeCall( cudaThreadSynchronize() );
}


@@ -1,362 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <cstdio>
#include <cuda_runtime.h>
#define CV_NO_BACKWARD_COMPATIBILITY
#include "opencv2/opencv.hpp"
#include "NCVHaarObjectDetection.hpp"
using namespace cv;
using namespace std;
const Size preferredVideoFrameSize(640, 480);
string preferredClassifier = "haarcascade_frontalface_alt.xml";
string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed";
void printSyntax(void)
{
printf("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n");
}
void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...)
{
int fontFace = CV_FONT_HERSHEY_PLAIN;
double fontScale = 1;
int baseline;
Size textSize = cv::getTextSize("T", fontFace, fontScale, 1, &baseline);
va_list arg_ptr;
va_start(arg_ptr, format);
int len = _vscprintf(format, arg_ptr) + 1;
vector<char> strBuf(len);
vsprintf_s(&strBuf[0], len, format, arg_ptr);
Point org(1, 3 * textSize.height * (lineOffsY + 1) / 2);
putText(img, &strBuf[0], org, fontFace, fontScale, color);
va_end(arg_ptr);
}
NCVStatus process(Mat *srcdst,
Ncv32u width, Ncv32u height,
NcvBool bShowAllHypotheses, NcvBool bLargestFace,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &d_haarStages, NCVVector<HaarClassifierNode128> &d_haarNodes,
NCVVector<HaarFeature64> &d_haarFeatures, NCVVector<HaarStage64> &h_haarStages,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp)
{
ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR);
NCVStatus ncvStat;
NCV_SET_SKIP_COND(gpuAllocator.isCounting());
NCVMatrixAlloc<Ncv8u> d_src(gpuAllocator, width, height);
ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);
ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100);
ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride());
NCV_SKIP_COND_BEGIN
(*srcdst).copyTo(h_src_hdr);
ncvStat = h_src.copySolid(d_src, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
NCV_SKIP_COND_END
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
Ncv32u numDetections;
ncvStat = ncvDetectObjectsMultiScale_device(
d_src, roi, d_rects, numDetections, haar, h_haarStages,
d_haarStages, d_haarNodes, d_haarFeatures,
haar.ClassifierSize,
bShowAllHypotheses ? 0 : 4,
1.2f, 1,
(bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace,
gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
NCV_SKIP_COND_BEGIN
ncvStat = d_src.copySolid(h_src, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
h_src_hdr.copyTo(*srcdst);
NCV_SKIP_COND_END
return NCV_SUCCESS;
}
int main( int argc, const char** argv )
{
NCVStatus ncvStat;
printf("NVIDIA Computer Vision SDK\n");
printf("Face Detection in video and live feed\n");
printf("=========================================\n");
printf(" Esc - Quit\n");
printf(" Space - Switch between NCV and OpenCV\n");
printf(" L - Switch between FullSearch and LargestFace modes\n");
printf(" U - Toggle unfiltered hypotheses visualization in FullSearch\n");
if (argc != 4 && argc != 1)
return printSyntax(), -1;
VideoCapture capture;
Size frameSize;
if (argc == 1 || strcmp(argv[1], "-c") == 0)
{
// Camera input is specified
int camIdx = (argc == 3) ? atoi(argv[2]) : 0;
if(!capture.open(camIdx))
return printf("Error opening camera\n"), -1;
capture.set(CV_CAP_PROP_FRAME_WIDTH, preferredVideoFrameSize.width);
capture.set(CV_CAP_PROP_FRAME_HEIGHT, preferredVideoFrameSize.height);
capture.set(CV_CAP_PROP_FPS, 25);
frameSize = preferredVideoFrameSize;
}
else if (strcmp(argv[1], "-v") == 0)
{
// Video file input (avi)
if(!capture.open(argv[2]))
return printf("Error opening video file\n"), -1;
frameSize.width = (int)capture.get(CV_CAP_PROP_FRAME_WIDTH);
frameSize.height = (int)capture.get(CV_CAP_PROP_FRAME_HEIGHT);
}
else
return printSyntax(), -1;
NcvBool bUseOpenCV = true;
NcvBool bLargestFace = true;
NcvBool bShowAllHypotheses = false;
string classifierFile = (argc == 1) ? preferredClassifier : argv[3];
CascadeClassifier classifierOpenCV;
if (!classifierOpenCV.load(classifierFile))
return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1;
int devId;
ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
cudaDeviceProp devProp;
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);
printf("Using GPU %d %s, arch=%d.%d\n", devId, devProp.name, devProp.major, devProp.minor);
//==============================================================================
//
// Load the classifier from file (assuming its size is about 1 mb)
// using a simple allocator
//
//==============================================================================
NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice);
ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);
NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned);
ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);
Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1);
NCVVectorAlloc<HaarStage64> h_haarStages(cpuCascadeAllocator, haarNumStages);
ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1);
NCVVectorAlloc<HaarClassifierNode128> h_haarNodes(cpuCascadeAllocator, haarNumNodes);
ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1);
NCVVectorAlloc<HaarFeature64> h_haarFeatures(cpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1);
HaarClassifierCascadeDescriptor haar;
ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, h_haarStages, h_haarNodes, h_haarFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1);
NCVVectorAlloc<HaarStage64> d_haarStages(gpuCascadeAllocator, haarNumStages);
ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1);
NCVVectorAlloc<HaarClassifierNode128> d_haarNodes(gpuCascadeAllocator, haarNumNodes);
ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1);
NCVVectorAlloc<HaarFeature64> d_haarFeatures(gpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1);
ncvStat = h_haarStages.copySolid(d_haarStages, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
ncvStat = h_haarNodes.copySolid(d_haarNodes, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
//==============================================================================
//
// Calculate memory requirements and create real allocators
//
//==============================================================================
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1);
NCVMemStackAllocator cpuCounter(devProp.textureAlignment);
ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1);
ncvStat = process(NULL, frameSize.width, frameSize.height,
false, false, haar,
d_haarStages, d_haarNodes,
d_haarFeatures, h_haarStages,
gpuCounter, cpuCounter, devProp);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);
ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1);
NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);
ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1);
printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height);
//==============================================================================
//
// Main processing loop
//
//==============================================================================
namedWindow(wndTitle, 1);
Mat frame, gray, frameDisp;
for(;;)
{
// For camera and video file, capture the next image
capture >> frame;
if (frame.empty())
break;
cvtColor(frame, gray, CV_BGR2GRAY);
// process
NcvSize32u minSize = haar.ClassifierSize;
if (bLargestFace)
{
Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;
Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;
Ncv32u ratioSmallest = std::min(ratioX, ratioY);
ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f);
minSize.width *= ratioSmallest;
minSize.height *= ratioSmallest;
}
NcvTimer timer = ncvStartTimer();
if (!bUseOpenCV)
{
ncvStat = process(&gray, frameSize.width, frameSize.height,
bShowAllHypotheses, bLargestFace, haar,
d_haarStages, d_haarNodes,
d_haarFeatures, h_haarStages,
gpuAllocator, cpuAllocator, devProp);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
}
else
{
vector<Rect> rectsOpenCV;
classifierOpenCV.detectMultiScale(
gray,
rectsOpenCV,
1.2f,
bShowAllHypotheses && !bLargestFace ? 0 : 4,
(bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE,
Size(minSize.width, minSize.height));
for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)
rectangle(gray, rectsOpenCV[rt], Scalar(255));
}
Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);
cvtColor(gray, frameDisp, CV_GRAY2BGR);
imagePrintf(frameDisp, 0, CV_RGB(255, 0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":"");
imagePrintf(frameDisp, 1, CV_RGB(255, 0,0), "L - Switch FullSearch%s / LargestFace%s modes", bLargestFace?"":" (ON)", bLargestFace?" (ON)":"");
imagePrintf(frameDisp, 2, CV_RGB(255, 0,0), "U - Toggle unfiltered hypotheses visualization in FullSearch %s", bShowAllHypotheses?"(ON)":"(OFF)");
imagePrintf(frameDisp, 3, CV_RGB(118,185,0), " Running at %f FPS on %s", 1000.0f / avgTime, bUseOpenCV?"CPU":"GPU");
cv::imshow(wndTitle, frameDisp);
switch (cvWaitKey(1))
{
case ' ':
bUseOpenCV = !bUseOpenCV;
break;
case 'L':case 'l':
bLargestFace = !bLargestFace;
break;
case 'U':case 'u':
bShowAllHypotheses = !bShowAllHypotheses;
break;
case 27:
return 0;
}
}
return 0;
}


@@ -57,8 +57,8 @@
#include <algorithm>
#include "npp.h"
#include "NCV.hpp"
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
@@ -970,8 +970,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
Ncv32f scaleArea,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
Ncv32u devPropMajor,
Ncv32u devPropMinor,
cudaDeviceProp &devProp,
cudaStream_t cuStream)
{
ncvAssertReturn(d_integralImage.memType() == d_weights.memType() &&
@@ -1077,15 +1076,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
(haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
NcvBool bTexCacheCascade = devPropMajor < 2;
NcvBool bTexCacheCascade = devProp.major < 2;
NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
NcvBool bDoAtomicCompaction = devPropMajor >= 2 || (devPropMajor == 1 && devPropMinor >= 3);
NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
Ncv32u szNppCompactTmpBuf;
nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf);
nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf, devProp);
if (bDoAtomicCompaction)
{
szNppCompactTmpBuf = 0;
@@ -1185,11 +1184,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
}
else
{
NppStStatus nppSt;
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(),
d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
}
numDetections = *hp_numDet;
}
@@ -1240,11 +1239,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
}
else
{
NppStStatus nppSt;
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(),
d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturnNcvStat(nppSt);
}
swap(d_ptrNowData, d_ptrNowTmp);
@@ -1310,11 +1309,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
}
else
{
NppStStatus nppSt;
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturnNcvStat(nppSt);
}
swap(d_ptrNowData, d_ptrNowTmp);
@@ -1371,11 +1370,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
}
else
{
NppStStatus nppSt;
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf);
ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR);
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturnNcvStat(nppSt);
}
swap(d_ptrNowData, d_ptrNowTmp);
@@ -1715,8 +1714,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
Ncv32u devPropMajor,
Ncv32u devPropMinor,
cudaDeviceProp &devProp,
cudaStream_t cuStream)
{
ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
@@ -1773,12 +1771,12 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NppStStatus nppStat;
NCVStatus nppStat;
Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
nppStat = nppiStIntegralGetSize_8u32u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral);
ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
nppStat = nppiStSqrIntegralGetSize_8u64u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral);
ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
@@ -1786,15 +1784,15 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
d_integralImage.ptr(), d_integralImage.pitch(),
NppStSize32u(d_srcImg.width(), d_srcImg.height()),
d_tmpIIbuf.ptr(), szTmpBufIntegral);
ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
NcvSize32u(d_srcImg.width(), d_srcImg.height()),
d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
NppStSize32u(d_srcImg.width(), d_srcImg.height()),
d_tmpIIbuf.ptr(), szTmpBufSqIntegral);
ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
NcvSize32u(d_srcImg.width(), d_srcImg.height()),
d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
NCV_SKIP_COND_END
@@ -1859,7 +1857,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
Ncv32u scale = scalesVector[i];
NcvSize32u srcRoi, scaledIIRoi, searchRoi;
NppStSize32u srcIIRoi;
NcvSize32u srcIIRoi;
srcRoi.width = d_srcImg.width();
srcRoi.height = d_srcImg.height();
srcIIRoi.width = srcRoi.width + 1;
@@ -1875,15 +1873,15 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
d_integralImage.ptr(), d_integralImage.pitch(),
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
srcIIRoi, scale, true);
ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
ncvAssertReturnNcvStat(nppStat);
nppStat = nppiStDownsampleNearest_64u_C1R(
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
srcIIRoi, scale, true);
ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
ncvAssertReturnNcvStat(nppStat);
const NppStRect32u rect(
const NcvRect32u rect(
HAAR_STDDEV_BORDER,
HAAR_STDDEV_BORDER,
haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
@@ -1892,9 +1890,9 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
d_rectStdDev.ptr(), d_rectStdDev.pitch(),
NppStSize32u(searchRoi.width, searchRoi.height), rect,
NcvSize32u(searchRoi.width, searchRoi.height), rect,
(Ncv32f)scale*scale, true);
ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR);
ncvAssertReturnNcvStat(nppStat);
NCV_SKIP_COND_END
@@ -1904,8 +1902,8 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
detectionsOnThisScale,
haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
searchRoi, pixelStep, (Ncv32f)scale*scale,
gpuAllocator, cpuAllocator, devPropMajor, devPropMinor, cuStream);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
gpuAllocator, cpuAllocator, devProp, cuStream);
ncvAssertReturnNcvStat(nppStat);
NCV_SKIP_COND_BEGIN
@@ -2250,6 +2248,10 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
return ncvStat;
}
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
@@ -2539,7 +2541,7 @@ NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
}
NCVStatus ncvHaarStoreNVBIN_host(std::string &filename,
NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
HaarClassifierCascadeDescriptor haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,


@@ -75,13 +75,13 @@ struct HaarFeature64
#define HaarFeature64_CreateCheck_MaxRectField 0xFF
__host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight)
__host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u /*clsWidth*/, Ncv32u /*clsHeight*/)
{
ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);
((NcvRect8u*)&(this->_ui2.x))->x = rectX;
((NcvRect8u*)&(this->_ui2.x))->y = rectY;
((NcvRect8u*)&(this->_ui2.x))->width = rectWidth;
((NcvRect8u*)&(this->_ui2.x))->height = rectHeight;
((NcvRect8u*)&(this->_ui2.x))->x = (Ncv8u)rectX;
((NcvRect8u*)&(this->_ui2.x))->y = (Ncv8u)rectY;
((NcvRect8u*)&(this->_ui2.x))->width = (Ncv8u)rectWidth;
((NcvRect8u*)&(this->_ui2.x))->height = (Ncv8u)rectHeight;
return NCV_SUCCESS;
}
@@ -306,11 +306,11 @@ struct HaarStage64
};
NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8);
NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
NPPST_CT_ASSERT(sizeof(HaarStage64) == 8);
NCV_CT_ASSERT(sizeof(HaarFeature64) == 8);
NCV_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
NCV_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
NCV_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
NCV_CT_ASSERT(sizeof(HaarStage64) == 8);
//==============================================================================
@@ -347,7 +347,7 @@ enum
NCVPipeObjDet_VisualizeInPlace = 0x004,
};
NCV_EXPORTS
NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NcvSize32u srcRoi,
NCVVector<NcvRect32u> &d_dstRects,
@@ -367,15 +367,14 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
Ncv32u devPropMajor,
Ncv32u devPropMinor,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
#define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF
#define HAAR_STDDEV_BORDER 1
NCV_EXPORTS
NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
NCVMatrix<Ncv32f> &d_weights,
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
@@ -391,11 +390,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImag
Ncv32f scaleArea,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
Ncv32u devPropMajor,
Ncv32u devPropMinor,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
NCV_EXPORTS
NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
NCVMatrix<Ncv32f> &h_weights,
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
@@ -409,7 +407,7 @@ NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
Ncv32u pixelStep,
Ncv32f scaleArea);
NCV_EXPORTS
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
@@ -419,7 +417,7 @@ NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv8u color,
cudaStream_t cuStream);
NCV_EXPORTS
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
@@ -429,7 +427,7 @@ NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u color,
cudaStream_t cuStream);
NCV_EXPORTS
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
@@ -438,7 +436,7 @@ NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u numRects,
Ncv8u color);
NCV_EXPORTS
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
@@ -450,7 +448,7 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
#define RECT_SIMILARITY_PROPORTION 0.2f
NCV_EXPORTS
NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
@@ -461,7 +459,7 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
Ncv32f curScale,
cudaStream_t cuStream);
NCV_EXPORTS
NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
@@ -471,18 +469,18 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
Ncv32u rectHeight,
Ncv32f curScale);
NCV_EXPORTS
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights);
NCV_EXPORTS
NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
Ncv32u &numNodes, Ncv32u &numFeatures);
NCV_EXPORTS
NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
@@ -490,6 +488,7 @@ NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
NCVVector<HaarFeature64> &h_HaarFeatures);
NCV_EXPORTS
NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
HaarClassifierCascadeDescriptor haar,
NCVVector<HaarStage64> &h_HaarStages,

File diff suppressed because it is too large.


@@ -0,0 +1,637 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _npp_staging_hpp_
#define _npp_staging_hpp_
#include "NCV.hpp"
/**
* \file NPP_staging.hpp
* NPP Staging Library
*/
/** \defgroup core_npp NPPST Core
* Basic functions for CUDA streams management.
* @{
*/
/**
* Gets the CUDA stream currently used by NPPST
* NOT THREAD SAFE
* \return Current CUDA stream
*/
cudaStream_t nppStGetActiveCUDAstream();
/**
* Sets the CUDA stream to be used by NPPST
* NOT THREAD SAFE
* \param cudaStream [IN] CUDA stream to become current
* \return CUDA stream used before
*/
cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
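/*
 * Illustrative usage sketch (the function name is hypothetical), assuming an
 * initialized CUDA context: route NPPST work onto a caller-owned stream, then
 * restore the previously active one.
 */
static void exampleWithCustomStream()
{
    cudaStream_t myStream;
    cudaStreamCreate(&myStream);
    cudaStream_t prevStream = nppStSetActiveCUDAstream(myStream); // returns the stream used before
    // ... enqueue NPPST calls here; they are now issued on myStream ...
    cudaStreamSynchronize(myStream);
    nppStSetActiveCUDAstream(prevStream);
    cudaStreamDestroy(myStream);
}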
/*@}*/
/** \defgroup nppi NPPST Image Processing
* @{
*/
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel.
*
* \param d_src [IN] Source image pointer (CUDA device memory)
* \param srcStep [IN] Source image line step
* \param d_dst [OUT] Destination image pointer (CUDA device memory)
* \param dstStep [IN] Destination image line step
* \param srcRoi [IN] Region of interest in the source image
* \param scale [IN] Downsampling scale factor (positive integer)
* \param readThruTexture [IN] Performance hint to cache source in texture (true) or read directly (false)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
Ncv32u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
Ncv32s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
Ncv32f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
Ncv64u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
Ncv64s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
Ncv64f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
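/*
 * Illustrative usage sketch (function and argument names are hypothetical):
 * decimate a packed single-channel 32-bit device image by a factor of 2.
 * Line steps are byte strides, matching the GpuMat-based callers above.
 */
static NCVStatus exampleDownsample2x(Ncv32u *d_src, Ncv32u *d_dst,
                                     Ncv32u srcWidth, Ncv32u srcHeight)
{
    NcvSize32u srcRoi(srcWidth, srcHeight);
    Ncv32u srcStep = (Ncv32u)(srcWidth * sizeof(Ncv32u));       // packed source rows
    Ncv32u dstStep = (Ncv32u)((srcWidth / 2) * sizeof(Ncv32u)); // packed destination rows
    return nppiStDownsampleNearest_32u_C1R(d_src, srcStep, d_dst, dstStep,
                                           srcRoi, 2 /*scale*/, true /*readThruTexture*/);
}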
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel. Host implementation.
*
* \param h_src [IN] Source image pointer (Host or pinned memory)
* \param srcStep [IN] Source image line step
* \param h_dst [OUT] Destination image pointer (Host or pinned memory)
* \param dstStep [IN] Destination image line step
* \param srcRoi [IN] Region of interest in the source image
* \param scale [IN] Downsampling scale factor (positive integer)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
Ncv32u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
Ncv32s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
Ncv32f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
Ncv64u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
Ncv64s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStDownsampleNearest_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
Ncv64f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
/**
* Computes standard deviation for each rectangular region of the input image using integral images.
*
* \param d_sum [IN] Integral image pointer (CUDA device memory)
* \param sumStep [IN] Integral image line step
* \param d_sqsum [IN] Squared integral image pointer (CUDA device memory)
* \param sqsumStep [IN] Squared integral image line step
* \param d_norm [OUT] Stddev image pointer (CUDA device memory). Each pixel contains stddev of a rect with top-left corner at the original location in the image
* \param normStep [IN] Stddev image line step
* \param roi [IN] Region of interest in the source image
* \param rect [IN] Rectangular region to calculate stddev over
* \param scaleArea [IN] Multiplication factor to account for the decimated scale
* \param readThruTexture [IN] Performance hint to cache source in texture (true) or read directly (false)
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
Ncv64u *d_sqsum, Ncv32u sqsumStep,
Ncv32f *d_norm, Ncv32u normStep,
NcvSize32u roi, NcvRect32u rect,
Ncv32f scaleArea, NcvBool readThruTexture);
/**
* Computes standard deviation for each rectangular region of the input image using integral images. Host implementation
*
* \param h_sum [IN] Integral image pointer (Host or pinned memory)
* \param sumStep [IN] Integral image line step
* \param h_sqsum [IN] Squared integral image pointer (Host or pinned memory)
* \param sqsumStep [IN] Squared integral image line step
* \param h_norm [OUT] Stddev image pointer (Host or pinned memory). Each pixel contains stddev of a rect with top-left corner at the original location in the image
* \param normStep [IN] Stddev image line step
* \param roi [IN] Region of interest in the source image
* \param rect [IN] Rectangular region to calculate stddev over
* \param scaleArea [IN] Multiplication factor to account for the decimated scale
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStRectStdDev_32f_C1R_host(Ncv32u *h_sum, Ncv32u sumStep,
Ncv64u *h_sqsum, Ncv32u sqsumStep,
Ncv32f *h_norm, Ncv32u normStep,
NcvSize32u roi, NcvRect32u rect,
Ncv32f scaleArea);
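/*
 * Illustrative host-side sketch (names are hypothetical): build the integral
 * and squared-integral images with the host helpers declared further below,
 * then compute a stddev map over 8x8 windows. Integral images are one pixel
 * larger in each dimension, matching the (rows+1, cols+1) allocations in the
 * cv::gpu callers; all steps are byte strides.
 */
static NCVStatus exampleHostStdDev(Ncv8u *h_img, Ncv32u w, Ncv32u h, Ncv32f *h_stddev)
{
    Ncv32u *sum   = new Ncv32u[(w + 1) * (h + 1)];
    Ncv64u *sqsum = new Ncv64u[(w + 1) * (h + 1)];
    NcvSize32u roi(w, h);
    Ncv32u sumStep   = (Ncv32u)((w + 1) * sizeof(Ncv32u));
    Ncv32u sqsumStep = (Ncv32u)((w + 1) * sizeof(Ncv64u));
    NCVStatus st = nppiStIntegral_8u32u_C1R_host(h_img, w, sum, sumStep, roi);
    if (st == NCV_SUCCESS)
        st = nppiStSqrIntegral_8u64u_C1R_host(h_img, w, sqsum, sqsumStep, roi);
    if (st == NCV_SUCCESS)
    {
        NcvRect32u rect(0, 0, 8, 8);        // 8x8 window at each anchor
        NcvSize32u outRoi(w - 7, h - 7);    // anchors where the window fits entirely
        st = nppiStRectStdDev_32f_C1R_host(sum, sumStep, sqsum, sqsumStep,
                                           h_stddev, (Ncv32u)(w * sizeof(Ncv32f)),
                                           outRoi, rect, 1.0f /*scaleArea: unscaled image*/);
    }
    delete [] sum;
    delete [] sqsum;
    return st;
}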
/**
* Transposes an image. 32-bit unsigned pixels, single channel
*
* \param d_src [IN] Source image pointer (CUDA device memory)
* \param srcStride [IN] Source image line step
* \param d_dst [OUT] Destination image pointer (CUDA device memory)
* \param dstStride [IN] Destination image line step
* \param srcRoi [IN] Region of interest of the source image
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_32u_C1R(Ncv32u *d_src, Ncv32u srcStride,
Ncv32u *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 32-bit signed pixels, single channel
* \see nppiStTranspose_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_32s_C1R(Ncv32s *d_src, Ncv32u srcStride,
Ncv32s *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 32-bit float pixels, single channel
* \see nppiStTranspose_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_32f_C1R(Ncv32f *d_src, Ncv32u srcStride,
Ncv32f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 64-bit unsigned pixels, single channel
* \see nppiStTranspose_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_64u_C1R(Ncv64u *d_src, Ncv32u srcStride,
Ncv64u *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 64-bit signed pixels, single channel
* \see nppiStTranspose_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_64s_C1R(Ncv64s *d_src, Ncv32u srcStride,
Ncv64s *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 64-bit float pixels, single channel
* \see nppiStTranspose_32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
Ncv64f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
*
* \param h_src [IN] Source image pointer (Host or pinned memory)
* \param srcStride [IN] Source image line step
* \param h_dst [OUT] Destination image pointer (Host or pinned memory)
* \param dstStride [IN] Destination image line step
* \param srcRoi [IN] Region of interest of the source image
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStride,
Ncv32u *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 32-bit signed pixels, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStride,
Ncv32s *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 32-bit float pixels, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStride,
Ncv32f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 64-bit unsigned pixels, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStride,
Ncv64u *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 64-bit signed pixels, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStride,
Ncv64s *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
/**
* Transposes an image. 64-bit float pixels, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
Ncv64f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
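/*
 * Illustrative host-side sketch (names are hypothetical): transpose a packed
 * w x h image into an h x w destination; strides are byte strides.
 */
static NCVStatus exampleHostTranspose(Ncv32u *h_src, Ncv32u *h_dst, Ncv32u w, Ncv32u h)
{
    NcvSize32u srcRoi(w, h);
    return nppiStTranspose_32u_C1R_host(h_src, (Ncv32u)(w * sizeof(Ncv32u)),
                                        h_dst, (Ncv32u)(h * sizeof(Ncv32u)), srcRoi);
}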
/**
* Calculates the size of the temporary buffer for integral image creation
*
* \param roiSize [IN] Size of the input image
* \param pBufsize [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
* \param devProp [IN] CUDA device properties structure, containing texture alignment information
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStIntegralGetSize_8u32u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp);
/**
* Calculates the size of the temporary buffer for integral image creation
* \see nppiStIntegralGetSize_8u32u
*/
NCV_EXPORTS
NCVStatus nppiStIntegralGetSize_32f32f(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp);
/**
* Creates an integral image representation for the input image
*
* \param d_src [IN] Source image pointer (CUDA device memory)
* \param srcStep [IN] Source image line step
* \param d_dst [OUT] Destination integral image pointer (CUDA device memory)
* \param dstStep [IN] Destination image line step
* \param roiSize [IN] Region of interest of the source image
* \param pBuffer [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
* \param bufSize [IN] Size of the pBuffer in bytes
* \param devProp [IN] CUDA device properties structure, containing texture alignment information
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStIntegral_8u32u_C1R(Ncv8u *d_src, Ncv32u srcStep,
Ncv32u *d_dst, Ncv32u dstStep, NcvSize32u roiSize,
Ncv8u *pBuffer, Ncv32u bufSize, cudaDeviceProp &devProp);
/**
* Creates an integral image representation for the input image
* \see nppiStIntegral_8u32u_C1R
*/
NCV_EXPORTS
NCVStatus nppiStIntegral_32f32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
Ncv32f *d_dst, Ncv32u dstStep, NcvSize32u roiSize,
Ncv8u *pBuffer, Ncv32u bufSize, cudaDeviceProp &devProp);
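/*
 * Illustrative sketch (names are hypothetical) of the two-phase pattern used
 * throughout this module: query the scratch-buffer size, allocate it, then run
 * the kernel. This mirrors cv::gpu::integralBuffered above; devProp comes from
 * cudaGetDeviceProperties for the current device.
 */
static NCVStatus exampleIntegral(Ncv8u *d_src, Ncv32u srcStep,
                                 Ncv32u *d_sum, Ncv32u sumStep,
                                 NcvSize32u roi, cudaDeviceProp &devProp)
{
    Ncv32u bufSize = 0;
    NCVStatus st = nppiStIntegralGetSize_8u32u(roi, &bufSize, devProp);
    if (st != NCV_SUCCESS) return st;
    Ncv8u *d_buf = 0;
    cudaMalloc((void**)&d_buf, bufSize);   // scratch space in device memory
    st = nppiStIntegral_8u32u_C1R(d_src, srcStep, d_sum, sumStep, roi,
                                  d_buf, bufSize, devProp);
    cudaFree(d_buf);
    return st;
}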
/**
* Creates an integral image representation for the input image. Host implementation
*
* \param h_src [IN] Source image pointer (Host or pinned memory)
* \param srcStep [IN] Source image line step
* \param h_dst [OUT] Destination integral image pointer (Host or pinned memory)
* \param dstStep [IN] Destination image line step
* \param roiSize [IN] Region of interest of the source image
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStIntegral_8u32u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
Ncv32u *h_dst, Ncv32u dstStep, NcvSize32u roiSize);
/**
* Creates an integral image representation for the input image. Host implementation
* \see nppiStIntegral_8u32u_C1R_host
*/
NCV_EXPORTS
NCVStatus nppiStIntegral_32f32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
Ncv32f *h_dst, Ncv32u dstStep, NcvSize32u roiSize);
/**
* Calculates the size of the temporary buffer for squared integral image creation
*
* \param roiSize [IN] Size of the input image
* \param pBufsize [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
* \param devProp [IN] CUDA device properties structure, containing texture alignment information
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStSqrIntegralGetSize_8u64u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp);
/**
* Creates a squared integral image representation for the input image
*
* \param d_src [IN] Source image pointer (CUDA device memory)
* \param srcStep [IN] Source image line step
* \param d_dst [OUT] Destination squared integral image pointer (CUDA device memory)
* \param dstStep [IN] Destination image line step
* \param roiSize [IN] Region of interest of the source image
* \param pBuffer [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
* \param bufSize [IN] Size of the pBuffer in bytes
* \param devProp [IN] CUDA device properties structure, containing texture alignment information
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStSqrIntegral_8u64u_C1R(Ncv8u *d_src, Ncv32u srcStep,
Ncv64u *d_dst, Ncv32u dstStep, NcvSize32u roiSize,
Ncv8u *pBuffer, Ncv32u bufSize, cudaDeviceProp &devProp);
/**
* Creates a squared integral image representation for the input image. Host implementation
*
* \param h_src [IN] Source image pointer (Host or pinned memory)
* \param srcStep [IN] Source image line step
* \param h_dst [OUT] Destination squared integral image pointer (Host or pinned memory)
* \param dstStep [IN] Destination image line step
* \param roiSize [IN] Region of interest of the source image
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
Ncv64u *h_dst, Ncv32u dstStep, NcvSize32u roiSize);
/*@}*/
/** \defgroup npps NPPST Signal Processing
* @{
*/
/**
* Calculates the size of the temporary buffer for vector compaction. 32-bit unsigned values
*
* \param srcLen [IN] Length of the input vector in elements
* \param pBufsize [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes)
* \param devProp [IN] CUDA device properties structure, containing texture alignment information
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppsStCompactGetSize_32u(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp);
/**
* Calculates the size of the temporary buffer for vector compaction. 32-bit signed values
* \see nppsStCompactGetSize_32u
*/
NCVStatus nppsStCompactGetSize_32s(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp);
/**
* Calculates the size of the temporary buffer for vector compaction. 32-bit float values
* \see nppsStCompactGetSize_32u
*/
NCVStatus nppsStCompactGetSize_32f(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp);
/**
* Compacts the input vector by removing elements of specified value. 32-bit unsigned values
*
* \param d_src [IN] Source vector pointer (CUDA device memory)
* \param srcLen [IN] Source vector length
* \param d_dst [OUT] Destination vector pointer (CUDA device memory)
* \param p_dstLen [OUT] Pointer to the destination vector length (Pinned memory or NULL)
* \param elemRemove [IN] The value to be removed
* \param pBuffer [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory)
* \param bufSize [IN] Size of the pBuffer in bytes
* \param devProp [IN] CUDA device properties structure, containing texture alignment information
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppsStCompact_32u(Ncv32u *d_src, Ncv32u srcLen,
Ncv32u *d_dst, Ncv32u *p_dstLen,
Ncv32u elemRemove, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp);
/**
* Compacts the input vector by removing elements of specified value. 32-bit signed values
* \see nppsStCompact_32u
*/
NCV_EXPORTS
NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
Ncv32s *d_dst, Ncv32u *p_dstLen,
Ncv32s elemRemove, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp);
/**
* Compacts the input vector by removing elements of specified value. 32-bit float values
* \see nppsStCompact_32u
*/
NCV_EXPORTS
NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
Ncv32f *d_dst, Ncv32u *p_dstLen,
Ncv32f elemRemove, Ncv8u *pBuffer,
Ncv32u bufSize, cudaDeviceProp &devProp);
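/*
 * Illustrative sketch (names are hypothetical): remove all occurrences of a
 * sentinel value from a device vector. The output count is written through
 * pinned host memory, mirroring the hp_numDet usage in the detector code above.
 */
static NCVStatus exampleCompact(Ncv32u *d_src, Ncv32u srcLen, Ncv32u *d_dst,
                                Ncv32u sentinel, cudaDeviceProp &devProp)
{
    Ncv32u bufSize = 0;
    NCVStatus st = nppsStCompactGetSize_32u(srcLen, &bufSize, devProp);
    if (st != NCV_SUCCESS) return st;
    Ncv8u *d_buf = 0;
    cudaMalloc((void**)&d_buf, bufSize);
    Ncv32u *hp_dstLen = 0;
    cudaMallocHost((void**)&hp_dstLen, sizeof(Ncv32u)); // pinned, readable after sync
    st = nppsStCompact_32u(d_src, srcLen, d_dst, hp_dstLen, sentinel,
                           d_buf, bufSize, devProp);
    cudaStreamSynchronize(0);   // *hp_dstLen now holds the compacted length
    cudaFreeHost(hp_dstLen);
    cudaFree(d_buf);
    return st;
}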
/**
* Compacts the input vector by removing elements of specified value. 32-bit unsigned values. Host implementation
*
* \param h_src [IN] Source vector pointer (Host or pinned memory)
* \param srcLen [IN] Source vector length
* \param h_dst [OUT] Destination vector pointer (Host or pinned memory)
* \param dstLen [OUT] Pointer to the destination vector length (can be NULL)
* \param elemRemove [IN] The value to be removed
*
* \return NCV status code
*/
NCV_EXPORTS
NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove);
/**
* Compacts the input vector by removing elements of specified value. 32-bit signed values. Host implementation
* \see nppsStCompact_32u_host
*/
NCV_EXPORTS
NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove);
/**
* Compacts the input vector by removing elements of specified value. 32-bit float values. Host implementation
* \see nppsStCompact_32u_host
*/
NCV_EXPORTS
NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove);
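/*
 * Illustrative host-side sketch: compacting {5, 0, 7, 0, 9} with elemRemove=0
 * yields {5, 7, 9} and dstLen == 3 (the destination is sized for the worst case).
 */
static void exampleHostCompact()
{
    Ncv32u src[5] = {5, 0, 7, 0, 9};
    Ncv32u dst[5];
    Ncv32u dstLen = 0;
    nppsStCompact_32u_host(src, 5, dst, &dstLen, 0 /*elemRemove*/);
}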
/*@}*/
#endif // _npp_staging_hpp_


@@ -40,15 +40,13 @@
//M*/
#include <precomp.hpp>
#if !defined (HAVE_CUDA)
#else /* !defined (HAVE_CUDA) */
#include <ios>
#include <stdarg.h>
#include "NCV.hpp"
@@ -94,17 +92,6 @@ void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)
//==============================================================================
NCVStatus GPUAlignmentValue(Ncv32u &alignment)
{
int curDev;
cudaDeviceProp curProp;
ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR);
alignment = curProp.textureAlignment; //GPUAlignmentValue(curProp.major);
return NCV_SUCCESS;
}
Ncv32u alignUp(Ncv32u what, Ncv32u alignment)
{
Ncv32u alignMask = alignment-1;
@@ -216,7 +203,7 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
}
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment)
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
:
currentSize(0),
_maxSize(0),
@@ -229,17 +216,26 @@ NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity,
allocBegin = NULL;
switch (memT)
if (reusePtr == NULL)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPageable:
allocBegin = (Ncv8u *)malloc(capacity);
break;
bReusesMemory = false;
switch (memT)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPageable:
allocBegin = (Ncv8u *)malloc(capacity);
break;
}
}
else
{
bReusesMemory = true;
allocBegin = (Ncv8u *)reusePtr;
}
if (capacity == 0)
@@ -260,18 +256,23 @@ NCVMemStackAllocator::~NCVMemStackAllocator()
if (allocBegin != NULL)
{
ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");
switch (_memType)
if (!bReusesMemory)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaFree(allocBegin), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
break;
case NCVMemoryTypeHostPageable:
free(allocBegin);
break;
switch (_memType)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaFree(allocBegin), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
break;
case NCVMemoryTypeHostPageable:
free(allocBegin);
break;
}
}
allocBegin = NULL;
}
}
@@ -356,14 +357,14 @@ size_t NCVMemStackAllocator::maxSize(void) const
//===================================================================
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT)
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
:
currentSize(0),
_maxSize(0),
_memType(memT)
_memType(memT),
_alignment(alignment)
{
ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", );
}


@@ -42,8 +42,49 @@
#ifndef _ncv_hpp_
#define _ncv_hpp_
#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined CVAPI_EXPORTS //&& !defined(__CUDACC__)
#define NCV_EXPORTS __declspec(dllexport)
#else
#define NCV_EXPORTS
#endif
#include <cuda_runtime.h>
#include "npp_staging.h"
//==============================================================================
//
// Compile-time assert functionality
//
//==============================================================================
/**
* Compile-time assert namespace
*/
namespace NcvCTprep
{
template <bool x>
struct CT_ASSERT_FAILURE;
template <>
struct CT_ASSERT_FAILURE<true> {};
template <int x>
struct assertTest{};
}
#define NCV_CT_PREP_PASTE_AUX(a,b) a##b ///< Concatenation indirection macro
#define NCV_CT_PREP_PASTE(a,b) NCV_CT_PREP_PASTE_AUX(a, b) ///< Concatenation macro
/**
* Performs compile-time assertion of a condition on the file scope
*/
#define NCV_CT_ASSERT(X) \
typedef NcvCTprep::assertTest<sizeof(NcvCTprep::CT_ASSERT_FAILURE< (bool)(X) >)> \
NCV_CT_PREP_PASTE(__ct_assert_typedef_, __LINE__)
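/*
 * For example, the following line compiles only when the condition holds,
 * because CT_ASSERT_FAILURE is specialized solely for <true> and taking
 * sizeof an incomplete type is a compile-time error:
 */
NCV_CT_ASSERT(sizeof(int) >= 4);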
//==============================================================================
@@ -82,62 +123,72 @@ typedef float Ncv32f;
typedef double Ncv64f;
typedef struct
struct NcvRect8u
{
Ncv8u x;
Ncv8u y;
Ncv8u width;
Ncv8u height;
} NcvRect8u;
NcvRect8u() : x(0), y(0), width(0), height(0) {};
NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
};
typedef struct
struct NcvRect32s
{
Ncv32s x; ///< x-coordinate of upper left corner.
Ncv32s y; ///< y-coordinate of upper left corner.
Ncv32s width; ///< Rectangle width.
Ncv32s height; ///< Rectangle height.
} NcvRect32s;
NcvRect32s() : x(0), y(0), width(0), height(0) {};
NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
};
typedef struct
struct NcvRect32u
{
Ncv32u x; ///< x-coordinate of upper left corner.
Ncv32u y; ///< y-coordinate of upper left corner.
Ncv32u width; ///< Rectangle width.
Ncv32u height; ///< Rectangle height.
} NcvRect32u;
NcvRect32u() : x(0), y(0), width(0), height(0) {};
NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
};
typedef struct
struct NcvSize32s
{
Ncv32s width; ///< Rectangle width.
Ncv32s height; ///< Rectangle height.
} NcvSize32s;
NcvSize32s() : width(0), height(0) {};
NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
};
typedef struct
struct NcvSize32u
{
Ncv32u width; ///< Rectangle width.
Ncv32u height; ///< Rectangle height.
} NcvSize32u;
NcvSize32u() : width(0), height(0) {};
NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
};
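Turning the C-style typedefs into structs with constructors keeps the layout identical (see the size asserts below) while letting call sites initialize in a single expression. An illustrative before/after, with w and h standing in for any dimensions:

    NcvSize32u roi;                // old style: field-by-field assignment
    roi.width  = w;
    roi.height = h;

    NcvSize32u roi2(w, h);         // new style: one expression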
NPPST_CT_ASSERT(sizeof(NcvBool) <= 4);
NPPST_CT_ASSERT(sizeof(Ncv64s) == 8);
NPPST_CT_ASSERT(sizeof(Ncv64u) == 8);
NPPST_CT_ASSERT(sizeof(Ncv32s) == 4);
NPPST_CT_ASSERT(sizeof(Ncv32u) == 4);
NPPST_CT_ASSERT(sizeof(Ncv16s) == 2);
NPPST_CT_ASSERT(sizeof(Ncv16u) == 2);
NPPST_CT_ASSERT(sizeof(Ncv8s) == 1);
NPPST_CT_ASSERT(sizeof(Ncv8u) == 1);
NPPST_CT_ASSERT(sizeof(Ncv32f) == 4);
NPPST_CT_ASSERT(sizeof(Ncv64f) == 8);
NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));
NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));
NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));
NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));
NCV_CT_ASSERT(sizeof(NcvBool) <= 4);
NCV_CT_ASSERT(sizeof(Ncv64s) == 8);
NCV_CT_ASSERT(sizeof(Ncv64u) == 8);
NCV_CT_ASSERT(sizeof(Ncv32s) == 4);
NCV_CT_ASSERT(sizeof(Ncv32u) == 4);
NCV_CT_ASSERT(sizeof(Ncv16s) == 2);
NCV_CT_ASSERT(sizeof(Ncv16u) == 2);
NCV_CT_ASSERT(sizeof(Ncv8s) == 1);
NCV_CT_ASSERT(sizeof(Ncv8u) == 1);
NCV_CT_ASSERT(sizeof(Ncv32f) == 4);
NCV_CT_ASSERT(sizeof(Ncv64f) == 8);
NCV_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));
NCV_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));
NCV_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));
NCV_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));
//==============================================================================
@@ -162,13 +213,13 @@ const Ncv32u K_LOG2_WARP_SIZE = 5;
#define NCV_CT_PREP_STRINGIZE(x) NCV_CT_PREP_STRINGIZE_AUX(x)
void ncvDebugOutput(const char *msg, ...);
NCV_EXPORTS void ncvDebugOutput(const char *msg, ...);
typedef void NCVDebugOutputHandler(const char* msg);
void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
#define ncvAssertPrintCheck(pred, msg) \
@@ -222,6 +273,7 @@ void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
*/
enum NCVStatus
{
//NCV statuses
NCV_SUCCESS,
NCV_CUDA_ERROR,
@@ -257,6 +309,24 @@ enum NCVStatus
NCV_NOIMPL_HAAR_TILTED_FEATURES,
NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW,
//NPP statuses
NPPST_SUCCESS = NCV_SUCCESS, ///< Successful operation (same as NPP_NO_ERROR)
NPPST_ERROR, ///< Unknown error
NPPST_CUDA_KERNEL_EXECUTION_ERROR, ///< CUDA kernel execution error
NPPST_NULL_POINTER_ERROR, ///< NULL pointer argument error
NPPST_TEXTURE_BIND_ERROR, ///< CUDA texture binding error or non-zero offset returned
NPPST_MEMCPY_ERROR, ///< CUDA memory copy error
NPPST_MEM_ALLOC_ERR, ///< CUDA memory allocation error
NPPST_MEMFREE_ERR, ///< CUDA memory deallocation error
//NPPST statuses
NPPST_INVALID_ROI, ///< Invalid region of interest argument
NPPST_INVALID_STEP, ///< Invalid image lines step argument (check sign, alignment, relation to image width)
NPPST_INVALID_SCALE, ///< Invalid scale parameter passed
NPPST_MEM_INSUFFICIENT_BUFFER, ///< Insufficient user-allocated buffer
NPPST_MEM_RESIDENCE_ERROR, ///< Memory residence error detected (check if pointers should be device or pinned)
NPPST_MEM_INTERNAL_ERROR, ///< Internal memory management error
};
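Folding the NPPST codes into NCVStatus (with NPPST_SUCCESS aliased to NCV_SUCCESS) gives NCV and NPP_staging entry points a single return type and a single checking path. A hedged sketch of a caller-side check, where someNppStagingCall is a placeholder for any such entry point:

    NCVStatus ncvStat = someNppStagingCall();  // hypothetical call returning NCVStatus
    ncvAssertReturnNcvStat(ncvStat);           // propagates any non-success code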
@@ -285,11 +355,11 @@ enum NCVStatus
typedef struct _NcvTimer *NcvTimer;
NcvTimer ncvStartTimer(void);
NCV_EXPORTS NcvTimer ncvStartTimer(void);
double ncvEndQueryTimerUs(NcvTimer t);
NCV_EXPORTS double ncvEndQueryTimerUs(NcvTimer t);
double ncvEndQueryTimerMs(NcvTimer t);
NCV_EXPORTS double ncvEndQueryTimerMs(NcvTimer t);
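A minimal usage sketch of the now-exported timer API:

    NcvTimer timer = ncvStartTimer();
    // ... work to be measured ...
    double elapsedMs = ncvEndQueryTimerMs(timer);  // elapsed time in milliseconds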
//==============================================================================
@@ -299,16 +369,10 @@ double ncvEndQueryTimerMs(NcvTimer t);
//==============================================================================
/**
* Alignment of GPU memory chunks in bytes
*/
NCVStatus GPUAlignmentValue(Ncv32u &alignment);
/**
* Calculates the aligned top bound value
*/
Ncv32u alignUp(Ncv32u what, Ncv32u alignment);
NCV_EXPORTS Ncv32u alignUp(Ncv32u what, Ncv32u alignment);
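With the device query dropped from this header, callers now pass an alignment in explicitly (e.g. devProp.textureAlignment), and alignUp simply rounds a value up to the next multiple of it; for illustration:

    Ncv32u a = alignUp(100, 32);   // == 128
    Ncv32u b = alignUp(128, 32);   // == 128, already aligned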
/**
@@ -326,7 +390,7 @@ enum NCVMemoryType
/**
* NCVMemPtr
*/
struct NCVMemPtr
struct NCV_EXPORTS NCVMemPtr
{
void *ptr;
NCVMemoryType memtype;
@@ -337,7 +401,7 @@ struct NCVMemPtr
/**
* NCVMemSegment
*/
struct NCVMemSegment
struct NCV_EXPORTS NCVMemSegment
{
NCVMemPtr begin;
size_t size;
@@ -348,7 +412,7 @@ struct NCVMemSegment
/**
* INCVMemAllocator (Interface)
*/
class INCVMemAllocator
class NCV_EXPORTS INCVMemAllocator
{
public:
virtual ~INCVMemAllocator() = 0;
@@ -370,7 +434,7 @@ inline INCVMemAllocator::~INCVMemAllocator() {}
/**
* NCVMemStackAllocator
*/
class NCVMemStackAllocator : public INCVMemAllocator
class NCV_EXPORTS NCVMemStackAllocator : public INCVMemAllocator
{
NCVMemStackAllocator();
NCVMemStackAllocator(const NCVMemStackAllocator &);
@@ -378,7 +442,7 @@ class NCVMemStackAllocator : public INCVMemAllocator
public:
explicit NCVMemStackAllocator(Ncv32u alignment);
NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment);
NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr=NULL);
virtual ~NCVMemStackAllocator();
virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
@@ -400,17 +464,18 @@ private:
Ncv8u *end;
size_t currentSize;
size_t _maxSize;
NcvBool bReusesMemory;
};
/**
* NCVMemNativeAllocator
*/
class NCVMemNativeAllocator : public INCVMemAllocator
class NCV_EXPORTS NCVMemNativeAllocator : public INCVMemAllocator
{
public:
NCVMemNativeAllocator(NCVMemoryType memT);
NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment);
virtual ~NCVMemNativeAllocator();
virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
@@ -438,9 +503,9 @@ private:
/**
* Copy dispatcher
*/
NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
const void *src, NCVMemoryType srcType,
size_t sz, cudaStream_t cuStream);
NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
const void *src, NCVMemoryType srcType,
size_t sz, cudaStream_t cuStream);
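A hedged sketch of the copy dispatcher (buffer names are illustrative): it selects the appropriate copy mechanism from the two memory-type tags, here host-pageable to device on the default stream:

    NCVStatus stat = memSegCopyHelper(d_ptr, NCVMemoryTypeDevice,
                                      h_ptr, NCVMemoryTypeHostPageable,
                                      nBytes, 0 /* default stream */);
    ncvAssertReturnNcvStat(stat);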
/**
@@ -514,6 +579,7 @@ class NCVVectorAlloc : public NCVVector<T>
{
NCVVectorAlloc();
NCVVectorAlloc(const NCVVectorAlloc &);
NCVVectorAlloc& operator=(const NCVVectorAlloc<T>&);
public:
@@ -563,8 +629,7 @@ public:
return allocatedMem;
}
private:
private:
INCVMemAllocator &allocator;
NCVMemSegment allocatedMem;
};
@@ -707,7 +772,7 @@ class NCVMatrixAlloc : public NCVMatrix<T>
{
NCVMatrixAlloc();
NCVMatrixAlloc(const NCVMatrixAlloc &);
NCVMatrixAlloc& operator=(const NCVMatrixAlloc &);
public:
NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0)
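A hedged end-to-end sketch of the exported pieces above (capacity and sizes are illustrative, devProp comes from cudaGetDeviceProperties):

    // Hypothetical: carve a device vector and a device matrix out of one stack allocator
    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, 16 * 1024 * 1024,
                                      static_cast<Ncv32u>(devProp.textureAlignment));
    NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 1000);
    NCVMatrixAlloc<Ncv8u> d_img(gpuAllocator, 640, 480);
    ncvAssertReturn(d_rects.isMemAllocated() && d_img.isMemAllocated(),
                    NCV_ALLOCATOR_BAD_ALLOC);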

View File

@@ -1,3 +1,51 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _ncvruntimetemplates_hpp_
#define _ncvruntimetemplates_hpp_
#include <stdarg.h>
#include <vector>
////////////////////////////////////////////////////////////////////////////////
// The Loki Library
// Copyright (c) 2001 by Andrei Alexandrescu
@@ -14,13 +62,6 @@
// http://loki-lib.sourceforge.net/index.php?n=Main.License
////////////////////////////////////////////////////////////////////////////////
#ifndef _ncvruntimetemplates_hpp_
#define _ncvruntimetemplates_hpp_
#include <stdarg.h>
#include <vector>
namespace Loki
{
//==============================================================================

View File

@@ -68,51 +68,51 @@ namespace cv { namespace gpu { namespace device
//! Read Write Traits
template <size_t src_elem_size, size_t dst_elem_size>
struct UnReadWriteTraits_
{
enum {shift=1};
};
template <size_t src_elem_size>
struct UnReadWriteTraits_<src_elem_size, 1>
{
enum {shift=4};
};
template <size_t src_elem_size>
struct UnReadWriteTraits_<src_elem_size, 2>
{
enum {shift=2};
template <size_t src_elem_size, size_t dst_elem_size>
struct UnReadWriteTraits_
{
enum {shift=1};
};
template <typename T, typename D> struct UnReadWriteTraits
{
enum {shift=UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift};
typedef typename TypeVec<T, shift>::vec_t read_type;
typedef typename TypeVec<D, shift>::vec_t write_type;
template <size_t src_elem_size>
struct UnReadWriteTraits_<src_elem_size, 1>
{
enum {shift=4};
};
template <size_t src_elem_size>
struct UnReadWriteTraits_<src_elem_size, 2>
{
enum {shift=2};
};
template <typename T, typename D> struct UnReadWriteTraits
{
enum {shift=UnReadWriteTraits_<sizeof(T), sizeof(D)>::shift};
typedef typename TypeVec<T, shift>::vec_t read_type;
typedef typename TypeVec<D, shift>::vec_t write_type;
};
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
struct BinReadWriteTraits_
{
enum {shift=1};
template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size>
struct BinReadWriteTraits_
{
enum {shift=1};
};
template <size_t src_elem_size1, size_t src_elem_size2>
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
{
enum {shift=4};
template <size_t src_elem_size1, size_t src_elem_size2>
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 1>
{
enum {shift=4};
};
template <size_t src_elem_size1, size_t src_elem_size2>
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
{
enum {shift=2};
template <size_t src_elem_size1, size_t src_elem_size2>
struct BinReadWriteTraits_<src_elem_size1, src_elem_size2, 2>
{
enum {shift=2};
};
template <typename T1, typename T2, typename D> struct BinReadWriteTraits
{
enum {shift=BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
typedef typename TypeVec<T1, shift>::vec_t read_type1;
typedef typename TypeVec<T2, shift>::vec_t read_type2;
typedef typename TypeVec<D , shift>::vec_t write_type;
template <typename T1, typename T2, typename D> struct BinReadWriteTraits
{
enum {shift=BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)>::shift};
typedef typename TypeVec<T1, shift>::vec_t read_type1;
typedef typename TypeVec<T2, shift>::vec_t read_type2;
typedef typename TypeVec<D , shift>::vec_t write_type;
};
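For example (a hypothetical instantiation of the traits above): with 8-bit source and destination elements, the dst_elem_size == 1 specialization selects shift == 4, so each thread reads and writes a 4-lane vector per transaction:

    // uchar in, uchar out -> shift == 4, vectorized as uchar4
    typedef UnReadWriteTraits<uchar, uchar>::read_type  read4;   // uchar4
    typedef UnReadWriteTraits<uchar, uchar>::write_type write4;  // uchar4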
//! Transform kernels
@@ -122,14 +122,14 @@ namespace cv { namespace gpu { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src.x);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
}
@@ -138,18 +138,18 @@ namespace cv { namespace gpu { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src.x);
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
}
@@ -158,22 +158,22 @@ namespace cv { namespace gpu { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src.x);
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
}
@@ -182,65 +182,65 @@ namespace cv { namespace gpu { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src.x);
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
dst.z = op(src.z);
if (mask(y, x_shifted + 3))
dst.w = op(src.w);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
dst.z = op(src1.z, src2.z);
if (mask(y, x_shifted + 3))
dst.w = op(src1.w, src2.w);
}
};
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
{
typedef typename UnReadWriteTraits<T, D>::read_type read_type;
typedef typename UnReadWriteTraits<T, D>::write_type write_type;
const int shift = UnReadWriteTraits<T, D>::shift;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * shift;
if (y < src_.rows)
{
const T* src = src_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + shift - 1 < src_.cols)
{
read_type src_n_el = ((const read_type*)src)[x];
write_type dst_n_el;
OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src[real_x]);
}
}
}
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, UnOp op)
{
typedef typename UnReadWriteTraits<T, D>::read_type read_type;
typedef typename UnReadWriteTraits<T, D>::write_type write_type;
const int shift = UnReadWriteTraits<T, D>::shift;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * shift;
if (y < src_.rows)
{
const T* src = src_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + shift - 1 < src_.cols)
{
read_type src_n_el = ((const read_type*)src)[x];
write_type dst_n_el;
OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src[real_x]);
}
}
}
}
template <typename T, typename D, typename UnOp, typename Mask>
@@ -255,44 +255,44 @@ namespace cv { namespace gpu { namespace device
}
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_,
const Mask mask, BinOp op)
{
typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
const int shift = BinReadWriteTraits<T1, T2, D>::shift;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * shift;
if (y < src1_.rows)
{
const T1* src1 = src1_.ptr(y);
const T2* src2 = src2_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + shift - 1 < src1_.cols)
{
read_type1 src1_n_el = ((const read_type1*)src1)[x];
read_type2 src2_n_el = ((const read_type2*)src2)[x];
write_type dst_n_el;
OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src1[real_x], src2[real_x]);
}
}
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_,
const Mask mask, BinOp op)
{
typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
const int shift = BinReadWriteTraits<T1, T2, D>::shift;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * shift;
if (y < src1_.rows)
{
const T1* src1 = src1_.ptr(y);
const T2* src2 = src2_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + shift - 1 < src1_.cols)
{
read_type1 src1_n_el = ((const read_type1*)src1)[x];
read_type2 src2_n_el = ((const read_type2*)src2)[x];
write_type dst_n_el;
OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src1[real_x], src2[real_x]);
}
}
}
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
@@ -355,11 +355,11 @@ namespace cv
template <typename T, typename D, typename UnOp, typename Mask>
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask,
cudaStream_t stream = 0)
{
{
const int shift = device::UnReadWriteTraits<T, D>::shift;
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x * shift);
grid.y = divUp(src.rows, threads.y);
@@ -373,7 +373,7 @@ namespace cv
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
BinOp op, const Mask& mask, cudaStream_t stream = 0)
{
{
const int shift = device::BinReadWriteTraits<T1, T2, D>::shift;
dim3 threads(16, 16, 1);
@@ -392,7 +392,7 @@ namespace cv
template <typename T, typename D, typename UnOp, typename Mask>
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask,
cudaStream_t stream = 0)
{
{
TransformChooser<device::VecTraits<T>::cn == 1 && device::VecTraits<D>::cn == 1 && device::UnReadWriteTraits<T, D>::shift != 1>::call(src, dst, op, mask, stream);
}

View File

@@ -69,9 +69,9 @@
#include "cufft.h"
#include "opencv2/gpu/stream_accessor.hpp"
#include "npp.h"
#include "npp_staging.h"
#include "nvidia/NCV.hpp"
#include "nvidia/core/NCV.hpp"
#include "nvidia/NPP_staging/npp_staging.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
#define CUDART_MINIMUM_REQUIRED_VERSION 3020