First version of CascadeClassifier_GPU.

Only for VS2008 now. Sample for it. new NPP_staging for VS2008 only
2011-01-13 13:04:00 +00:00
parent 31e582e314
commit 1a94186195
17 changed files with 6067 additions and 186 deletions
--- a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample
+++ b/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample
@@ -0,0 +1,362 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include <cstdio>
+#include <cuda_runtime.h>
+
+#define CV_NO_BACKWARD_COMPATIBILITY
+
+#include "opencv2/opencv.hpp"
+
+#include "NCVHaarObjectDetection.hpp"
+
+using namespace cv;
+using namespace std;
+
+const Size preferredVideoFrameSize(640, 480);
+
+string preferredClassifier = "haarcascade_frontalface_alt.xml";
+string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed";
+
+
+// Prints the command-line usage string to stdout.
+void printSyntax(void)
+{
+    fputs("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n", stdout);
+}
+
+// printf-style helper: formats a message and draws it on img at text row lineOffsY.
+void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...)
+{
+    const int face = CV_FONT_HERSHEY_PLAIN;
+    const double scale = 1;
+
+    // Format into a buffer sized by _vscprintf (character count plus the terminator)
+    va_list args;
+    va_start(args, format);
+    const int bufLen = _vscprintf(format, args) + 1;
+    vector<char> text(bufLen);
+    vsprintf_s(&text[0], bufLen, format, args);
+
+    // Row pitch is 1.5x the height of a reference glyph
+    int baseLine;
+    const Size glyph = cv::getTextSize("T", face, scale, 1, &baseLine);
+    const Point origin(1, 3 * glyph.height * (lineOffsY + 1) / 2);
+    putText(img, &text[0], origin, face, scale, color);
+    va_end(args);
+}
+
+// Runs one detection pass through ncvDetectObjectsMultiScale_device and copies the image back into *srcdst.
+NCVStatus process(Mat *srcdst,
+                  Ncv32u width, Ncv32u height,
+                  NcvBool bShowAllHypotheses, NcvBool bLargestFace,
+                  HaarClassifierCascadeDescriptor &haar,
+                  NCVVector<HaarStage64> &d_haarStages, NCVVector<HaarClassifierNode128> &d_haarNodes,
+                  NCVVector<HaarFeature64> &d_haarFeatures, NCVVector<HaarStage64> &h_haarStages,
+                  INCVMemAllocator &gpuAllocator,
+                  INCVMemAllocator &cpuAllocator,
+                  cudaDeviceProp &devProp)
+{
+    // srcdst must be NULL exactly when gpuAllocator is a counting allocator
+    ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR);
+    NCVStatus ncvStat;
+
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
+
+    NCVMatrixAlloc<Ncv8u> d_src(gpuAllocator, width, height);
+    ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);
+    ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100);        
+    ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
+    // OpenCV header wrapping h_src's buffer and stride, so Mat::copyTo can fill and read it
+    Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride());
+
+    NCV_SKIP_COND_BEGIN        
+    // NOTE(review): NCV_SKIP_COND_* are defined elsewhere; presumably this section is skipped when counting - confirm
+    (*srcdst).copyTo(h_src_hdr);
+    
+    ncvStat = h_src.copySolid(d_src, 0);
+    ncvAssertReturnNcvStat(ncvStat);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+    NCV_SKIP_COND_END
+
+    NcvSize32u roi;
+    roi.width = d_src.width();
+    roi.height = d_src.height();
+
+    Ncv32u numDetections;
+    ncvStat = ncvDetectObjectsMultiScale_device(
+        d_src, roi, d_rects, numDetections, haar, h_haarStages,
+        d_haarStages, d_haarNodes, d_haarFeatures,
+        haar.ClassifierSize,
+        bShowAllHypotheses ? 0 : 4,
+        1.2f, 1,
+        (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace,
+        gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0);
+    ncvAssertReturnNcvStat(ncvStat);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+    NCV_SKIP_COND_BEGIN
+    // Copy d_src back through h_src into *srcdst
+    ncvStat = d_src.copySolid(h_src, 0);
+    ncvAssertReturnNcvStat(ncvStat);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
+
+    h_src_hdr.copyTo(*srcdst);
+    
+    NCV_SKIP_COND_END
+
+    return NCV_SUCCESS;
+}
+
+// Sample entry point: detects faces in a camera or video-file feed, switchable at run time between
+// the NCV (GPU) and OpenCV (CPU) cascade implementations. Returns 0 on normal exit and -1 on any error.
+int main( int argc, const char** argv )
+{
+    NCVStatus ncvStat;
+
+    printf("NVIDIA Computer Vision SDK\n");
+    printf("Face Detection in video and live feed\n");
+    printf("=========================================\n");
+    printf("  Esc   - Quit\n");
+    printf("  Space - Switch between NCV and OpenCV\n");
+    printf("  L     - Switch between FullSearch and LargestFace modes\n");
+    printf("  U     - Toggle unfiltered hypotheses visualization in FullSearch\n");
+    
+    if (argc != 4 && argc != 1)
+        return printSyntax(), -1;
+
+    VideoCapture capture;    
+    Size frameSize;
+
+    if (argc == 1 || strcmp(argv[1], "-c") == 0)
+    {
+        // Camera input; when arguments are given argc is 4 (checked above), so the index is argv[2]
+        int camIdx = (argc == 4) ? atoi(argv[2]) : 0;
+        if(!capture.open(camIdx))        
+            return printf("Error opening camera\n"), -1;        
+        capture.set(CV_CAP_PROP_FRAME_WIDTH, preferredVideoFrameSize.width);
+        capture.set(CV_CAP_PROP_FRAME_HEIGHT, preferredVideoFrameSize.height);
+        capture.set(CV_CAP_PROP_FPS, 25);
+        frameSize = preferredVideoFrameSize;
+    }
+    else if (strcmp(argv[1], "-v") == 0)
+    {
+        // Video file input (avi)
+        if(!capture.open(argv[2]))
+            return printf("Error opening video file\n"), -1;
+
+        frameSize.width  = (int)capture.get(CV_CAP_PROP_FRAME_WIDTH);
+        frameSize.height = (int)capture.get(CV_CAP_PROP_FRAME_HEIGHT);
+    }
+    else
+        return printSyntax(), -1;
+
+    NcvBool bUseOpenCV = true;
+    NcvBool bLargestFace = true;
+    NcvBool bShowAllHypotheses = false;    
+
+    string classifierFile = (argc == 1) ? preferredClassifier : argv[3];
+    
+    CascadeClassifier classifierOpenCV;
+    if (!classifierOpenCV.load(classifierFile))
+        return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1;
+
+    int devId;
+    ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
+    cudaDeviceProp devProp;
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);
+    printf("Using GPU %d %s, arch=%d.%d\n", devId, devProp.name, devProp.major, devProp.minor);
+
+    //==============================================================================
+    //
+    // Load the classifier from file (assuming its size is about 1 mb)
+    // using a simple allocator
+    //
+    //==============================================================================
+
+    NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice);
+    ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);
+    NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned);
+    ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);
+
+    Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
+    ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1);
+
+    NCVVectorAlloc<HaarStage64> h_haarStages(cpuCascadeAllocator, haarNumStages);
+    ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1);
+    NCVVectorAlloc<HaarClassifierNode128> h_haarNodes(cpuCascadeAllocator, haarNumNodes);
+    ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1);
+    NCVVectorAlloc<HaarFeature64> h_haarFeatures(cpuCascadeAllocator, haarNumFeatures);
+    ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1);
+
+    HaarClassifierCascadeDescriptor haar;
+    ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, h_haarStages, h_haarNodes, h_haarFeatures);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1);
+
+    NCVVectorAlloc<HaarStage64> d_haarStages(gpuCascadeAllocator, haarNumStages);
+    ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1);
+    NCVVectorAlloc<HaarClassifierNode128> d_haarNodes(gpuCascadeAllocator, haarNumNodes);
+    ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1);
+    NCVVectorAlloc<HaarFeature64> d_haarFeatures(gpuCascadeAllocator, haarNumFeatures);
+    ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1);
+
+    ncvStat = h_haarStages.copySolid(d_haarStages, 0);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
+    ncvStat = h_haarNodes.copySolid(d_haarNodes, 0);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
+    ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
+
+    //==============================================================================
+    //
+    // Calculate memory requirements and create real allocators
+    //
+    //==============================================================================
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1);
+    NCVMemStackAllocator cpuCounter(devProp.textureAlignment);
+    ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1);
+
+    ncvStat = process(NULL, frameSize.width, frameSize.height,
+                      false, false, haar,
+                      d_haarStages, d_haarNodes,
+                      d_haarFeatures, h_haarStages,
+                      gpuCounter, cpuCounter, devProp);
+    ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
+
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);
+    ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1);
+    NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);
+    ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1);
+
+    printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height);
+
+    //==============================================================================
+    //
+    // Main processing loop
+    //
+    //==============================================================================
+    namedWindow(wndTitle, 1);
+
+    Mat frame, gray, frameDisp;
+
+    for(;;)
+    {
+        // For camera and video file, capture the next image                
+        capture >> frame;
+        if (frame.empty())
+            break;
+                
+        cvtColor(frame, gray, CV_BGR2GRAY);
+
+        // process
+        NcvSize32u minSize = haar.ClassifierSize;
+        if (bLargestFace)
+        {
+            Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;
+            Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;
+            Ncv32u ratioSmallest = std::min(ratioX, ratioY);
+            ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f);
+            minSize.width *= ratioSmallest;
+            minSize.height *= ratioSmallest;
+        }
+        
+        NcvTimer timer = ncvStartTimer();
+
+        if (!bUseOpenCV)
+        {
+            ncvStat = process(&gray, frameSize.width, frameSize.height,
+                              bShowAllHypotheses, bLargestFace, haar,
+                              d_haarStages, d_haarNodes,
+                              d_haarFeatures, h_haarStages,
+                              gpuAllocator, cpuAllocator, devProp);
+            ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in frame processing pass", -1);
+        }
+        else
+        {
+            vector<Rect> rectsOpenCV;
+
+            classifierOpenCV.detectMultiScale(
+                gray,
+                rectsOpenCV,
+                1.2f,
+                bShowAllHypotheses && !bLargestFace ? 0 : 4,
+                (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE,
+                Size(minSize.width, minSize.height));
+
+            for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)
+                rectangle(gray, rectsOpenCV[rt], Scalar(255));
+        }
+
+        Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);
+        
+        cvtColor(gray, frameDisp, CV_GRAY2BGR);
+
+        imagePrintf(frameDisp, 0, CV_RGB(255,  0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":"");
+        imagePrintf(frameDisp, 1, CV_RGB(255,  0,0), "L - Switch FullSearch%s / LargestFace%s modes", bLargestFace?"":" (ON)", bLargestFace?" (ON)":"");
+        imagePrintf(frameDisp, 2, CV_RGB(255,  0,0), "U - Toggle unfiltered hypotheses visualization in FullSearch %s", bShowAllHypotheses?"(ON)":"(OFF)");
+        imagePrintf(frameDisp, 3, CV_RGB(118,185,0), "   Running at %f FPS on %s", 1000.0f / avgTime, bUseOpenCV?"CPU":"GPU");
+
+        cv::imshow(wndTitle, frameDisp);
+
+        switch (cvWaitKey(1))
+        {
+        case ' ':
+            bUseOpenCV = !bUseOpenCV;
+            break;
+        case 'L':case 'l':
+            bLargestFace = !bLargestFace;
+            break;
+        case 'U':case 'u':
+            bShowAllHypotheses = !bShowAllHypotheses;
+            break;
+        case 27:
+            return 0;            
+        }
+    }
+        
+    return 0;
+}
--- a/modules/gpu/src/nvidia/NCV.cpp
+++ b/modules/gpu/src/nvidia/NCV.cpp
@@ -0,0 +1,571 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+#include <precomp.hpp>
+
+
+#if !defined (HAVE_CUDA)
+
+
+#else /* !defined (HAVE_CUDA) */
+
+
+#include <stdarg.h>
+#include "NCV.hpp"
+
+
+//==============================================================================
+//
+// Error handling helpers
+//
+//==============================================================================
+
+
+// Default debug sink: prints msg to stdout through a "%s" format, so the text itself is never used as a format.
+static void stdioDebugOutput(const char *msg)
+{
+    printf("%s", msg);
+}
+
+// Sink called by ncvDebugOutput; starts as stdioDebugOutput and is replaced by ncvSetDebugOutputHandler.
+static NCVDebugOutputHandler *debugOutputHandler = stdioDebugOutput;
+
+// Formats a printf-style message into a fixed 1024-byte buffer and hands it to the current debug handler.
+void ncvDebugOutput(const char *msg, ...)
+{
+    enum { kMaxLen = 1024 };
+    char text[kMaxLen];
+    va_list argList;
+    va_start(argList, msg);
+    vsnprintf_s(text, sizeof(text), kMaxLen - 1, msg, argList);
+    va_end(argList);
+    debugOutputHandler(text);
+}
+
+// Installs func as the debug output sink; NULL restores the default stdout sink instead of storing a NULL callee.
+void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)
+{
+    debugOutputHandler = (func != NULL) ? func : stdioDebugOutput;
+}
+
+//==============================================================================
+//
+// Memory wrappers and helpers
+//
+//==============================================================================
+
+
+// Stores the current CUDA device's textureAlignment in `alignment`; both CUDA queries are checked with NCV_CUDA_ERROR.
+NCVStatus GPUAlignmentValue(Ncv32u &alignment)
+{
+    int deviceId;
+    ncvAssertCUDAReturn(cudaGetDevice(&deviceId), NCV_CUDA_ERROR);
+    cudaDeviceProp deviceProps;
+    ncvAssertCUDAReturn(cudaGetDeviceProperties(&deviceProps, deviceId), NCV_CUDA_ERROR);
+    alignment = deviceProps.textureAlignment;
+    return NCV_SUCCESS;
+}
+
+// Rounds `what` up to the next multiple of `alignment` by bit masking.
+// The result is only a true round-up when `alignment` is a power of two.
+Ncv32u alignUp(Ncv32u what, Ncv32u alignment)
+{
+    const Ncv32u mask = alignment - 1;
+    return (what + mask) & ~mask;
+}
+
+
+// Resets the pointer to the empty state: NULL address, memory type NCVMemoryTypeNone.
+void NCVMemPtr::clear()
+{
+    memtype = NCVMemoryTypeNone;
+    ptr = NULL;
+}
+
+// Resets the segment to empty: zero size and a cleared begin pointer.
+void NCVMemSegment::clear()
+{
+    size = 0;
+    begin.clear();
+}
+
+// Copies sz bytes from src to dst. Host-to-host uses memcpy; any copy involving device memory uses
+// cudaMemcpyAsync on cuStream when cuStream != 0 and cudaMemcpy otherwise. Other memory types give NCV_MEM_RESIDENCE_ERROR.
+NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NCVMemoryType srcType, size_t sz, cudaStream_t cuStream)
+{
+    NCVStatus ncvStat;
+    switch (dstType)
+    {
+    case NCVMemoryTypeHostPageable:
+    case NCVMemoryTypeHostPinned:
+        switch (srcType)
+        {
+        case NCVMemoryTypeHostPageable:
+        case NCVMemoryTypeHostPinned:
+            memcpy(dst, src, sz);
+            ncvStat = NCV_SUCCESS;
+            break;
+        case NCVMemoryTypeDevice:
+            if (cuStream != 0)
+            {
+                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
+            }
+            else
+            {
+                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
+            }
+            ncvStat = NCV_SUCCESS;
+            break;
+        default:
+            ncvStat = NCV_MEM_RESIDENCE_ERROR;
+        }
+        break;
+    case NCVMemoryTypeDevice:
+        switch (srcType)
+        {
+        case NCVMemoryTypeHostPageable:
+        case NCVMemoryTypeHostPinned:
+            if (cuStream != 0)
+            {
+                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
+            }
+            else
+            {
+                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
+            }
+            ncvStat = NCV_SUCCESS;
+            break;
+        case NCVMemoryTypeDevice:
+            if (cuStream != 0)
+            {
+                ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
+            }
+            else
+            {
+                ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
+            }
+            ncvStat = NCV_SUCCESS;
+            break;
+        default:
+            ncvStat = NCV_MEM_RESIDENCE_ERROR;
+        }
+        break;
+    default:
+        ncvStat = NCV_MEM_RESIDENCE_ERROR;
+    }
+
+    return ncvStat;
+}
+//===================================================================
+//
+// NCVMemStackAllocator class members implementation
+//
+//===================================================================
+
+
+// Counting-mode constructor: the memory type is NCVMemoryTypeNone (so isCounting() is true) and no buffer is allocated.
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
+    :
+    currentSize(0),
+    _maxSize(0),
+    allocBegin(NULL),
+    begin(NULL),
+    _memType(NCVMemoryTypeNone),
+    _alignment(alignment)
+{
+    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
+}
+
+// Allocates a `capacity`-byte buffer of type memT and, unless counting, sets the stack bounds [begin, end) over it.
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment)
+    :
+    currentSize(0),
+    _maxSize(0),
+    allocBegin(NULL),
+    _memType(memT),
+    _alignment(alignment)
+{
+    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
+    begin = end = NULL; // defined on every path, including the early returns taken when an allocation fails
+    // Zero capacity: use a non-NULL sentinel to mark the allocator initialized; skipping the allocation avoids leaking it
+    if (capacity == 0)
+    {
+        allocBegin = (Ncv8u *)(0x1);
+    }
+    else
+    {
+        switch (memT)
+        {
+        case NCVMemoryTypeDevice:
+            ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
+            break;
+        case NCVMemoryTypeHostPinned:
+            ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
+            break;
+        case NCVMemoryTypeHostPageable:
+            allocBegin = (Ncv8u *)malloc(capacity);
+            break;
+        }
+    }
+    if (!isCounting())
+    {
+        begin = allocBegin;
+        end = begin + capacity;
+    }
+}
+
+// Frees the backing buffer by memory type and reports leaked segments; the constructor's 0x1 zero-capacity sentinel is never freed.
+NCVMemStackAllocator::~NCVMemStackAllocator()
+{
+    if (allocBegin != NULL)
+    {
+        ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");
+        switch ((allocBegin != (Ncv8u *)(0x1)) ? _memType : NCVMemoryTypeNone)
+        {
+        case NCVMemoryTypeDevice:
+            ncvAssertCUDAReturn(cudaFree(allocBegin), );
+            break;
+        case NCVMemoryTypeHostPinned:
+            ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
+            break;
+        case NCVMemoryTypeHostPageable:
+            free(allocBegin);
+            break;
+        }
+        allocBegin = NULL;
+    }
+}
+
+// Carves a segment of `size` bytes, rounded up by alignUp, off the top of the stack.
+// A request that does not fit is rejected with NCV_ALLOCATOR_INSUFFICIENT_CAPACITY before any counter changes.
+NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size)
+{
+    seg.clear();
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
+
+    size = alignUp(size, this->_alignment);
+    // Check capacity first so that a rejected request does not inflate currentSize/_maxSize
+    if (!isCounting())
+    {
+        size_t availSize = end - begin;
+        ncvAssertReturn(size <= availSize, NCV_ALLOCATOR_INSUFFICIENT_CAPACITY);
+    }
+    this->currentSize += size;
+    this->_maxSize = std::max(this->_maxSize, this->currentSize);
+
+    seg.begin.ptr = begin;
+    seg.begin.memtype = this->_memType;
+    seg.size = size;
+    begin += size;
+    return NCV_SUCCESS;
+}
+
+// Pops `seg` off the stack; it must end exactly at the current top (seg.begin.ptr == begin - seg.size).
+NCVStatus NCVMemStackAllocator::dealloc(NCVMemSegment &seg)
+{
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
+    ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
+    ncvAssertReturn(seg.begin.ptr != NULL || isCounting(), NCV_ALLOCATOR_BAD_DEALLOC);
+    ncvAssertReturn(seg.begin.ptr == begin - seg.size, NCV_ALLOCATOR_DEALLOC_ORDER);
+
+    currentSize -= seg.size;
+    begin -= seg.size;
+
+    seg.clear();
+
+    ncvAssertReturn(allocBegin <= begin, NCV_ALLOCATOR_BAD_DEALLOC);
+
+    return NCV_SUCCESS;
+}
+
+// True when the alignment passes the power-of-two bit test and the allocator is either counting or owns a buffer.
+NcvBool NCVMemStackAllocator::isInitialized(void) const
+{
+    return ((this->_alignment & (this->_alignment-1)) == 0) && (isCounting() || this->allocBegin != NULL);
+}
+
+// Counting mode is encoded as memory type NCVMemoryTypeNone.
+NcvBool NCVMemStackAllocator::isCounting(void) const
+{
+    return NCVMemoryTypeNone == _memType;
+}
+
+// Memory type this allocator was constructed with.
+NCVMemoryType NCVMemStackAllocator::memType(void) const
+{
+    return _memType;
+}
+
+// Alignment value passed to the constructor; alloc() rounds every request up to it.
+Ncv32u NCVMemStackAllocator::alignment(void) const
+{
+    return _alignment;
+}
+
+// Largest value currentSize has reached, as recorded by alloc().
+size_t NCVMemStackAllocator::maxSize(void) const
+{
+    return _maxSize;
+}
+
+//===================================================================
+//
+// NCVMemNativeAllocator class members implementation
+//
+//===================================================================
+
+
+// Creates a pass-through allocator for memT and queries the device alignment for its size accounting.
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT)
+    : currentSize(0),
+    _maxSize(0),
+    _memType(memT)
+{
+    this->_alignment = 0; // stays 0 when a check below bails out, so isInitialized() reliably reports false
+    ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
+    ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", );
+}
+
+// Only checks that the live-byte counter is back to zero; nothing is freed here.
+NCVMemNativeAllocator::~NCVMemNativeAllocator()
+{
+    ncvAssertPrintCheck(currentSize == 0, "NCVMemNativeAllocator dtor:: detected memory leak");
+}
+
+// Allocates `size` bytes with the routine matching _memType (cudaMalloc, cudaMallocHost or malloc) and records them in seg.
+NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size)
+{
+    seg.clear();
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
+
+    switch (this->_memType)
+    {
+    case NCVMemoryTypeDevice:
+        ncvAssertCUDAReturn(cudaMalloc(&seg.begin.ptr, size), NCV_CUDA_ERROR);
+        break;
+    case NCVMemoryTypeHostPinned:
+        ncvAssertCUDAReturn(cudaMallocHost(&seg.begin.ptr, size), NCV_CUDA_ERROR);
+        break;
+    case NCVMemoryTypeHostPageable:
+        seg.begin.ptr = (Ncv8u *)malloc(size);
+        ncvAssertReturn(seg.begin.ptr != NULL || size == 0, NCV_ALLOCATOR_BAD_ALLOC); // malloc signals failure only via NULL
+        break;
+    }
+
+    this->currentSize += alignUp(size, this->_alignment);
+    this->_maxSize = std::max(this->_maxSize, this->currentSize);
+    seg.begin.memtype = this->_memType;
+    seg.size = size;
+
+    return NCV_SUCCESS;
+}
+
+// Releases `seg` with the routine matching _memType and subtracts its aligned size from the live-byte counter.
+NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)
+{
+    ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
+    ncvAssertReturn(seg.begin.memtype == _memType, NCV_ALLOCATOR_BAD_DEALLOC);
+    ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);
+
+    const Ncv32u alignedSize = alignUp(seg.size, _alignment);
+    ncvAssertReturn(currentSize >= alignedSize, NCV_ALLOCATOR_BAD_DEALLOC);
+    currentSize -= alignedSize;
+
+    if (_memType == NCVMemoryTypeDevice)
+    {
+        ncvAssertCUDAReturn(cudaFree(seg.begin.ptr), NCV_CUDA_ERROR);
+    }
+    else if (_memType == NCVMemoryTypeHostPinned)
+    {
+        ncvAssertCUDAReturn(cudaFreeHost(seg.begin.ptr), NCV_CUDA_ERROR);
+    }
+    else if (_memType == NCVMemoryTypeHostPageable)
+    {
+        free(seg.begin.ptr);
+    }
+
+    seg.clear();
+    return NCV_SUCCESS;
+}
+
+// The allocator counts as initialized once _alignment holds a non-zero value.
+NcvBool NCVMemNativeAllocator::isInitialized(void) const
+{
+    return 0 != _alignment;
+}
+
+// Always false: the constructor rejects NCVMemoryTypeNone, the type the stack allocator uses for counting.
+NcvBool NCVMemNativeAllocator::isCounting(void) const
+{
+    return false;
+}
+
+// Memory type this allocator was constructed with.
+NCVMemoryType NCVMemNativeAllocator::memType(void) const
+{
+    return _memType;
+}
+
+// Alignment obtained from GPUAlignmentValue in the constructor.
+Ncv32u NCVMemNativeAllocator::alignment(void) const
+{
+    return _alignment;
+}
+
+// Largest value currentSize has reached, as recorded by alloc().
+size_t NCVMemNativeAllocator::maxSize(void) const
+{
+    return _maxSize;
+}
+
+//===================================================================
+//
+// Time and timer routines
+//
+//===================================================================
+
+
+typedef struct _NcvTimeMoment NcvTimeMoment;
+
+#if defined(_WIN32) || defined(_WIN64)
+
+    #include <Windows.h>
+
+    // Performance-counter reading paired with the counter frequency sampled at the same time
+    typedef struct _NcvTimeMoment
+    {
+        LONGLONG moment, freq;
+    } NcvTimeMoment;
+
+    static void _ncvQueryMoment(NcvTimeMoment *t)
+    {
+        QueryPerformanceFrequency((LARGE_INTEGER *)&(t->freq));
+        QueryPerformanceCounter((LARGE_INTEGER *)&(t->moment));
+    }
+
+    double _ncvMomentToMicroseconds(NcvTimeMoment *t)
+    {
+        return 1000000.0 * t->moment / t->freq;
+    }
+
+    // Elapsed time t2 - t1; ticks are converted using the mean of the two sampled frequencies
+    double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
+    {
+        return 1000000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);
+    }
+
+    double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
+    {
+        return 1000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);
+    }
+
+#elif defined(__unix__)
+
+    #include <sys/time.h>
+
+    // Wall-clock reading obtained from gettimeofday
+    typedef struct _NcvTimeMoment
+    {
+        struct timeval tv;
+        struct timezone tz;
+    } NcvTimeMoment;
+
+    void _ncvQueryMoment(NcvTimeMoment *t)
+    {
+        gettimeofday(& t->tv, & t->tz);
+    }
+
+    double _ncvMomentToMicroseconds(NcvTimeMoment *t)
+    {
+        return 1000000.0 * t->tv.tv_sec + (double)t->tv.tv_usec;
+    }
+
+    double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
+    {
+        return (((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000000 + (double)t2->tv.tv_usec - (double)t1->tv.tv_usec);
+    }
+
+    // Counterpart of the Windows helper; ncvEndQueryTimerMs calls it on every platform
+    double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
+    {
+        return _ncvMomentsDiffToMicroseconds(t1, t2) / 1000.0;
+    }
+
+#endif //#if defined(_WIN32) || defined(_WIN64)
+
+// Timer state: t1 is sampled by ncvStartTimer, t2 by the ncvEndQueryTimer* functions.
+struct _NcvTimer
+{
+    NcvTimeMoment t1, t2;
+};
+
+// Starts a timer and returns its heap-allocated handle, or NULL when the allocation fails.
+NcvTimer ncvStartTimer(void)
+{
+    struct _NcvTimer *t = (struct _NcvTimer *)malloc(sizeof(struct _NcvTimer));
+    if (t != NULL)
+        _ncvQueryMoment(&t->t1);
+    return t;
+}
+
+// Returns the microseconds elapsed since the timer's start sample and frees the handle; a NULL handle yields 0.
+double ncvEndQueryTimerUs(NcvTimer t)
+{
+    if (t == NULL) return 0.0;
+    _ncvQueryMoment(&t->t2);
+    const double res = _ncvMomentsDiffToMicroseconds(&t->t1, &t->t2);
+    free(t);
+    return res;
+}
+
+// Returns the milliseconds elapsed since the timer's start sample and frees the handle; a NULL handle yields 0.
+double ncvEndQueryTimerMs(NcvTimer t)
+{
+    if (t == NULL) return 0.0;
+    _ncvQueryMoment(&t->t2);
+    const double res = _ncvMomentsDiffToMilliseconds(&t->t1, &t->t2);
+    free(t);
+    return res;
+}
+#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/nvidia/NCV.hpp
+++ b/modules/gpu/src/nvidia/NCV.hpp
@@ -0,0 +1,837 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _ncv_hpp_
+#define _ncv_hpp_
+
+#include <cuda_runtime.h>
+#include "npp_staging.h"
+
+
+//==============================================================================
+//
+// Alignment macros
+//
+//==============================================================================
+
+
+// Fallback definition of __align__(n) for host compilers. It is only provided
+// when the macro is not already defined and the translation unit is not being
+// compiled with __CUDACC__ set. No definition is supplied for platforms that
+// are neither Windows nor __unix__.
+#if !defined(__align__) && !defined(__CUDACC__)
+    #if defined(_WIN32) || defined(_WIN64)
+        #define __align__(n)         __declspec(align(n))
+    #elif defined(__unix__)
+        #define __align__(n)         __attribute__((__aligned__(n)))
+    #endif
+#endif
+
+
+//==============================================================================
+//
+// Integral and compound types of guaranteed size
+//
+//==============================================================================
+
+
+// Fixed-size scalar aliases; the sizes are verified by the compile-time
+// assertions that follow the compound types in this header.
+typedef               bool NcvBool;
+typedef          long long Ncv64s;
+typedef unsigned long long Ncv64u;
+typedef                int Ncv32s;
+typedef       unsigned int Ncv32u;
+typedef              short Ncv16s;
+typedef     unsigned short Ncv16u;
+// Plain 'char' has implementation-defined signedness; spell out 'signed' so
+// the "8s" type is signed on every target.
+typedef        signed char Ncv8s;
+typedef      unsigned char Ncv8u;
+typedef              float Ncv32f;
+typedef             double Ncv64f;
+
+
+// Rectangle with 8-bit unsigned fields. A compile-time check in this header
+// requires the whole struct to occupy exactly sizeof(Ncv32u) bytes.
+typedef struct
+{
+    Ncv8u x;
+    Ncv8u y;
+    Ncv8u width;
+    Ncv8u height;
+} NcvRect8u;
+
+
+// Rectangle with signed 32-bit fields.
+typedef struct
+{
+    Ncv32s x;          ///< x-coordinate of upper left corner.
+    Ncv32s y;          ///< y-coordinate of upper left corner.
+    Ncv32s width;      ///< Rectangle width.
+    Ncv32s height;     ///< Rectangle height.
+} NcvRect32s;
+
+
+// Rectangle with unsigned 32-bit fields.
+typedef struct
+{
+    Ncv32u x;          ///< x-coordinate of upper left corner.
+    Ncv32u y;          ///< y-coordinate of upper left corner.
+    Ncv32u width;      ///< Rectangle width.
+    Ncv32u height;     ///< Rectangle height.
+} NcvRect32u;
+
+
+// Two-dimensional size with signed 32-bit fields.
+typedef struct 
+{
+    Ncv32s width;  ///< Width.
+    Ncv32s height; ///< Height.
+} NcvSize32s;
+
+
+// Two-dimensional size with unsigned 32-bit fields.
+typedef struct 
+{
+    Ncv32u width;  ///< Width.
+    Ncv32u height; ///< Height.
+} NcvSize32u;
+
+
+// Compile-time verification of the sizes promised by the types above.
+NPPST_CT_ASSERT(sizeof(NcvBool) <= 4);
+NPPST_CT_ASSERT(sizeof(Ncv64s) == 8);
+NPPST_CT_ASSERT(sizeof(Ncv64u) == 8);
+NPPST_CT_ASSERT(sizeof(Ncv32s) == 4);
+NPPST_CT_ASSERT(sizeof(Ncv32u) == 4);
+NPPST_CT_ASSERT(sizeof(Ncv16s) == 2);
+NPPST_CT_ASSERT(sizeof(Ncv16u) == 2);
+NPPST_CT_ASSERT(sizeof(Ncv8s) == 1);
+NPPST_CT_ASSERT(sizeof(Ncv8u) == 1);
+NPPST_CT_ASSERT(sizeof(Ncv32f) == 4);
+NPPST_CT_ASSERT(sizeof(Ncv64f) == 8);
+NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));
+NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));
+NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));
+// NcvSize32s was the only compound type left unchecked.
+NPPST_CT_ASSERT(sizeof(NcvSize32s) == 2 * sizeof(Ncv32s));
+NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));
+
+
+//==============================================================================
+//
+// Persistent constants
+//
+//==============================================================================
+
+
+// Warp size constant and its base-2 logarithm (32 == 1 << 5); the two values
+// must be kept in sync.
+const Ncv32u K_WARP_SIZE = 32;
+const Ncv32u K_LOG2_WARP_SIZE = 5;
+
+
+//==============================================================================
+//
+// Error handling
+//
+//==============================================================================
+
+
+// Two-level stringification: the outer macro lets its argument be
+// macro-expanded first (e.g. __LINE__ becomes the line number) before the
+// inner macro applies the # operator.
+#define NCV_CT_PREP_STRINGIZE_AUX(x)    #x
+#define NCV_CT_PREP_STRINGIZE(x)        NCV_CT_PREP_STRINGIZE_AUX(x)
+
+
+// Variadic diagnostic output; the ncvAssert* macros in this header call it
+// with printf-style format strings such as "\n%s%d%s\n".
+void ncvDebugOutput(const char *msg, ...);
+
+
+// Signature of a user-supplied handler that receives a message string.
+typedef void NCVDebugOutputHandler(const char* msg);
+
+
+// Registers a debug output handler.
+// NOTE(review): presumably ncvDebugOutput forwards its formatted text to the
+// registered handler -- the definition is not visible here; confirm.
+void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
+
+
+// Expression macro: yields true when pred holds; otherwise prints the message
+// with file and line through ncvDebugOutput and yields false.
+// msg must be a string literal because it is concatenated with the adjacent
+// literals at compile time.
+#define ncvAssertPrintCheck(pred, msg) \
+    ((pred) ? true : (ncvDebugOutput("\n%s\n", \
+    "NCV Assertion Failed: " msg ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__) \
+    ), false))
+
+
+// Statement macro: when pred fails, prints msg (see ncvAssertPrintCheck) and
+// returns err from the enclosing function. err may be left empty, which is how
+// the constructors in this header use it to return without a value.
+// The "if (...) ; else return" shape expects the caller to supply the trailing
+// semicolon.
+#define ncvAssertPrintReturn(pred, msg, err) \
+    if (ncvAssertPrintCheck(pred, msg)) ; else return err
+
+
+// Statement macro: when pred fails, logs the numeric value of err together
+// with file and line, then returns err from the enclosing function.
+// err is parenthesized so that compound expressions (e.g. a conditional
+// expression) are cast and returned as a whole.
+#define ncvAssertReturn(pred, err) \
+    do \
+    { \
+        if (!(pred)) \
+        { \
+            ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: retcode=", (int)(err), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
+            return (err); \
+        } \
+    } while (0)
+
+
+// Statement macro: evaluates ncvOp exactly once into a local NCVStatus; when
+// the result is not NCV_SUCCESS, logs its numeric value with file and line and
+// returns that status from the enclosing function.
+#define ncvAssertReturnNcvStat(ncvOp) \
+    do \
+    { \
+        NCVStatus _ncvStat = ncvOp; \
+        if (NCV_SUCCESS != _ncvStat) \
+        { \
+            ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: NcvStat=", (int)_ncvStat, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
+            return _ncvStat; \
+        } \
+    } while (0)
+
+
+// Statement macro: evaluates cudacall once, then queries cudaGetLastError().
+// When either reports a failure, logs one error code with file and line and
+// returns errCode from the enclosing function.
+// The logged value is the call's own result when it failed, otherwise the
+// cudaGetLastError() result; OR-ing the two enum values together would print
+// a number that matches neither error.
+#define ncvAssertCUDAReturn(cudacall, errCode) \
+    do \
+    { \
+        cudaError_t resCall = cudacall; \
+        cudaError_t resGLE = cudaGetLastError(); \
+        if (cudaSuccess != resCall || cudaSuccess != resGLE) \
+        { \
+            ncvDebugOutput("\n%s%d%s\n", "NCV CUDA Assertion Failed: cudaError_t=", (int)((cudaSuccess != resCall) ? resCall : resGLE), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
+            return errCode; \
+        } \
+    } while (0)
+
+
+/**
+* Return-codes for status notification, errors and warnings
+*
+* Enumerator values are implicit and sequential, starting with
+* NCV_SUCCESS == 0. The assertion macros log them as integers, so inserting
+* or reordering entries changes the logged codes.
+*/
+enum NCVStatus
+{
+    NCV_SUCCESS,
+
+    NCV_CUDA_ERROR,
+    NCV_NPP_ERROR,
+    NCV_FILE_ERROR,
+
+    NCV_NULL_PTR,
+    NCV_INCONSISTENT_INPUT,
+    NCV_TEXTURE_BIND_ERROR,
+    NCV_DIMENSIONS_INVALID,
+
+    NCV_INVALID_ROI,
+    NCV_INVALID_STEP,
+    NCV_INVALID_SCALE,
+
+    NCV_ALLOCATOR_NOT_INITIALIZED,
+    NCV_ALLOCATOR_BAD_ALLOC,
+    NCV_ALLOCATOR_BAD_DEALLOC,
+    NCV_ALLOCATOR_INSUFFICIENT_CAPACITY,
+    NCV_ALLOCATOR_DEALLOC_ORDER,
+    NCV_ALLOCATOR_BAD_REUSE,
+
+    NCV_MEM_COPY_ERROR,
+    NCV_MEM_RESIDENCE_ERROR,
+    NCV_MEM_INSUFFICIENT_CAPACITY,
+
+    NCV_HAAR_INVALID_PIXEL_STEP,
+    NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER,
+    NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE,
+    NCV_HAAR_TOO_LARGE_FEATURES,
+    NCV_HAAR_XML_LOADING_EXCEPTION,
+
+    NCV_NOIMPL_HAAR_TILTED_FEATURES,
+
+    // No trailing comma after the last enumerator: it is only permitted from
+    // C++11 onwards.
+    NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW
+};
+
+
+// Skip-condition helpers. The four macros cooperate through a local bool named
+// __ncv_skip_cond that lives in the scope where NCV_SET_SKIP_COND is used.
+
+// Declares the local flag and initializes it with x.
+#define NCV_SET_SKIP_COND(x) \
+    bool __ncv_skip_cond = x
+
+
+// Assigns a new value to the flag declared by NCV_SET_SKIP_COND.
+#define NCV_RESET_SKIP_COND(x) \
+    __ncv_skip_cond = x
+
+
+// Opens a block that is executed only while the flag is false; must be closed
+// with NCV_SKIP_COND_END.
+#define NCV_SKIP_COND_BEGIN \
+    if (!__ncv_skip_cond) {
+
+
+// Closes the block opened by NCV_SKIP_COND_BEGIN.
+#define NCV_SKIP_COND_END \
+    }
+
+
+//==============================================================================
+//
+// Timer
+//
+//==============================================================================
+
+
+// Opaque timer handle; the structure behind it is defined in the
+// implementation file.
+typedef struct _NcvTimer *NcvTimer;
+
+// Creates a timer and records its start moment.
+NcvTimer ncvStartTimer(void);
+
+// Returns the elapsed time in microseconds and frees the timer; the handle
+// must not be reused afterwards.
+double ncvEndQueryTimerUs(NcvTimer t);
+
+// Returns the elapsed time in milliseconds and frees the timer; the handle
+// must not be reused afterwards.
+double ncvEndQueryTimerMs(NcvTimer t);
+
+
+//==============================================================================
+//
+// Memory management classes template compound types
+//
+//==============================================================================
+
+
+/**
+* Alignment of GPU memory chunks in bytes
+*
+* The value is delivered through the 'alignment' reference; the return value
+* is an NCVStatus code.
+* NOTE(review): how the value is obtained is not visible in this header.
+*/
+NCVStatus GPUAlignmentValue(Ncv32u &alignment);
+
+
+/**
+* Calculates the aligned top bound value
+*
+* NOTE(review): presumably rounds 'what' up to the next multiple of
+* 'alignment'; callers in this header use the result as a row pitch in bytes.
+* The definition is not visible here -- confirm.
+*/
+Ncv32u alignUp(Ncv32u what, Ncv32u alignment);
+
+
+/**
+* NCVMemoryType
+*
+* Tags the kind of memory a pointer refers to. NCVMemoryTypeNone is the value
+* that the clear() methods of the containers in this header assign to an
+* unbound object.
+*/
+enum NCVMemoryType
+{
+    NCVMemoryTypeNone,
+    NCVMemoryTypeHostPageable,
+    NCVMemoryTypeHostPinned,
+    NCVMemoryTypeDevice
+};
+
+
+/**
+* NCVMemPtr
+*
+* Raw pointer paired with the memory type it belongs to.
+*/
+struct NCVMemPtr
+{
+    void *ptr;
+    NCVMemoryType memtype;
+    // Defined outside this header.
+    // NOTE(review): presumably resets ptr/memtype to an "empty" state -- confirm.
+    void clear();
+};
+
+
+/**
+* NCVMemSegment
+*
+* Memory range described by its start pointer (with memory type) and a size.
+* The Reuse containers in this header compare 'size' against byte counts.
+*/
+struct NCVMemSegment
+{
+    NCVMemPtr begin;
+    size_t size;
+    // Defined outside this header.
+    // NOTE(review): presumably resets the segment to an "empty" state -- confirm.
+    void clear();
+};
+
+
+/**
+* INCVMemAllocator (Interface)
+*
+* Abstract allocator used by NCVVectorAlloc / NCVMatrixAlloc: they call
+* alloc() in their constructors and dealloc() in their destructors.
+*/
+class INCVMemAllocator
+{
+public:
+    // Pure virtual destructor; its (empty) body is provided inline below so
+    // that derived destructors can still chain to it.
+    virtual ~INCVMemAllocator() = 0;
+
+    // Allocation / release of a segment; 'size' is passed in bytes by the
+    // containers in this header.
+    virtual NCVStatus alloc(NCVMemSegment &seg, size_t size) = 0;
+    virtual NCVStatus dealloc(NCVMemSegment &seg) = 0;
+
+    virtual NcvBool isInitialized(void) const = 0;
+    // NOTE(review): a "counting" allocator makes isMemAllocated() of the Alloc
+    // containers report true even when the segment pointer is NULL; the exact
+    // semantics are defined by the implementations -- confirm.
+    virtual NcvBool isCounting(void) const = 0;
+    
+    virtual NCVMemoryType memType(void) const = 0;
+    virtual Ncv32u alignment(void) const = 0;
+    virtual size_t maxSize(void) const = 0;
+};
+
+inline INCVMemAllocator::~INCVMemAllocator() {}
+
+
+/**
+* NCVMemStackAllocator
+*
+* INCVMemAllocator implementation; all members are defined outside this header.
+* NOTE(review): the name and the NCV_ALLOCATOR_DEALLOC_ORDER status suggest
+* stack-ordered (LIFO) release -- confirm against the implementation.
+*/
+class NCVMemStackAllocator : public INCVMemAllocator
+{
+    // Default and copy construction are disabled (private declarations).
+    NCVMemStackAllocator();
+    NCVMemStackAllocator(const NCVMemStackAllocator &);
+
+public:
+
+    // Constructor taking only an alignment.
+    // NOTE(review): presumably creates a counting allocator that owns no memory -- confirm.
+    explicit NCVMemStackAllocator(Ncv32u alignment);
+    // Constructor taking a memory type, a capacity and an alignment.
+    NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment);
+    virtual ~NCVMemStackAllocator();
+
+    virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
+    virtual NCVStatus dealloc(NCVMemSegment &seg);
+
+    virtual NcvBool isInitialized(void) const;
+    virtual NcvBool isCounting(void) const;
+
+    virtual NCVMemoryType memType(void) const;
+    virtual Ncv32u alignment(void) const;
+    virtual size_t maxSize(void) const;
+
+private:
+
+    NCVMemoryType _memType;
+    Ncv32u _alignment;
+    Ncv8u *allocBegin;
+    Ncv8u *begin;
+    Ncv8u *end;
+    size_t currentSize;
+    size_t _maxSize;
+};
+
+
+/**
+* NCVMemNativeAllocator
+*
+* INCVMemAllocator implementation constructed from a memory type only; all
+* members are defined outside this header.
+* NOTE(review): presumably forwards each request to the native allocation
+* routine of the given memory type -- confirm against the implementation.
+*/
+class NCVMemNativeAllocator : public INCVMemAllocator
+{
+public:
+
+    NCVMemNativeAllocator(NCVMemoryType memT);
+    virtual ~NCVMemNativeAllocator();
+
+    virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
+    virtual NCVStatus dealloc(NCVMemSegment &seg);
+
+    virtual NcvBool isInitialized(void) const;
+    virtual NcvBool isCounting(void) const;
+
+    virtual NCVMemoryType memType(void) const;
+    virtual Ncv32u alignment(void) const;
+    virtual size_t maxSize(void) const;
+
+private:
+
+    // Default and copy construction are disabled (private declarations).
+    NCVMemNativeAllocator();
+    NCVMemNativeAllocator(const NCVMemNativeAllocator &);
+
+    NCVMemoryType _memType;
+    Ncv32u _alignment;
+    size_t currentSize;
+    size_t _maxSize;
+};
+
+
+/**
+* Copy dispatcher
+*
+* Called by the copySolid() methods of the containers in this header with the
+* destination/source pointers, their memory types, a byte count and a CUDA
+* stream.
+* NOTE(review): presumably selects the copy routine from the (dstType, srcType)
+* pair -- the definition is not visible here; confirm.
+*/
+NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
+                           const void *src, NCVMemoryType srcType,
+                           size_t sz, cudaStream_t cuStream);
+
+
+/**
+* NCVVector (1D)
+*
+* Base class describing a typed one-dimensional buffer: pointer, element count
+* and memory type. It performs no allocation itself; derived classes fill in
+* the protected fields.
+*/
+template <class T>
+class NCVVector
+{
+    // Copy construction is disabled (private declaration).
+    NCVVector(const NCVVector &);
+
+public:
+
+    NCVVector()
+    {
+        clear();
+    }
+
+    virtual ~NCVVector() {}
+
+    // Resets the descriptor to the unbound state.
+    void clear()
+    {
+        _memtype = NCVMemoryTypeNone;
+        _length = 0;
+        _ptr = NULL;
+    }
+
+    // Copies bytes from this vector into dst through memSegCopyHelper.
+    //   howMuch == 0 : both vectors must have the same length; everything is copied.
+    //   howMuch != 0 : number of bytes to copy; both vectors must be at least that large.
+    // Returns NCV_MEM_COPY_ERROR on a size mismatch, NCV_NULL_PTR when a typed
+    // vector has a NULL pointer, NCV_SUCCESS without copying when this vector
+    // has memory type None, otherwise the helper's status.
+    NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+    {
+        const size_t srcBytes = this->_length * sizeof(T);
+        const size_t dstBytes = dst._length * sizeof(T);
+
+        if (howMuch != 0)
+        {
+            ncvAssertReturn(dstBytes >= howMuch && srcBytes >= howMuch && howMuch > 0, NCV_MEM_COPY_ERROR);
+        }
+        else
+        {
+            ncvAssertReturn(dst._length == this->_length, NCV_MEM_COPY_ERROR);
+            howMuch = srcBytes;
+        }
+
+        const bool srcUsable = (this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone);
+        const bool dstUsable = (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone);
+        ncvAssertReturn(srcUsable && dstUsable, NCV_NULL_PTR);
+
+        if (this->_memtype == NCVMemoryTypeNone)
+        {
+            return NCV_SUCCESS;
+        }
+
+        return memSegCopyHelper(dst._ptr, dst._memtype,
+                                this->_ptr, this->_memtype,
+                                howMuch, cuStream);
+    }
+
+    T *ptr() const { return _ptr; }
+    size_t length() const { return _length; }
+    NCVMemoryType memType() const { return _memtype; }
+
+protected:
+
+    T *_ptr;
+    size_t _length;
+    NCVMemoryType _memtype;
+};
+
+
+/**
+* NCVVectorAlloc
+*
+* NCVVector that obtains its storage from an INCVMemAllocator in the
+* constructor and hands it back in the destructor. The allocator is held by
+* reference, so it must outlive the vector.
+*/
+template <class T>
+class NCVVectorAlloc : public NCVVector<T>
+{
+    // Default and copy construction are disabled (private declarations).
+    NCVVectorAlloc();
+    NCVVectorAlloc(const NCVVectorAlloc &);
+
+public:
+
+    // Requests length * sizeof(T) bytes from the allocator. When the request
+    // fails, a message is printed and the constructor returns early, leaving
+    // the vector in its cleared state (NULL pointer, zero length); callers can
+    // detect this with isMemAllocated().
+    NCVVectorAlloc(INCVMemAllocator &allocator, Ncv32u length)
+        :
+        allocator(allocator)
+    {
+        NCVStatus ncvStat;
+
+        this->clear();
+        this->allocatedMem.clear();
+
+        ncvStat = allocator.alloc(this->allocatedMem, length * sizeof(T));
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVVectorAlloc ctor:: alloc failed", );
+
+        this->_ptr = (T *)this->allocatedMem.begin.ptr;
+        this->_length = length;
+        this->_memtype = this->allocatedMem.begin.memtype;
+    }
+
+
+    // Returns the segment to the allocator; a failure is only reported through
+    // the debug output. dealloc() is called even if the constructor's alloc()
+    // failed.
+    ~NCVVectorAlloc()
+    {
+        NCVStatus ncvStat;
+
+        ncvStat = allocator.dealloc(this->allocatedMem);
+        ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVVectorAlloc dtor:: dealloc failed");
+
+        this->clear();
+    }
+
+
+    // True when the segment pointer is non-NULL or the allocator reports
+    // isCounting().
+    NcvBool isMemAllocated() const
+    {
+        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
+    }
+
+
+    // Alignment reported by the underlying allocator.
+    Ncv32u getAllocatorsAlignment() const
+    {
+        return allocator.alignment();
+    }
+
+
+    // Returns a copy of the segment descriptor obtained from the allocator.
+    NCVMemSegment getSegment() const
+    {
+        return allocatedMem;
+    }
+
+private:
+
+    INCVMemAllocator &allocator;
+    NCVMemSegment allocatedMem;
+};
+
+
+/**
+* NCVVectorReuse
+*
+* NCVVector view over an existing NCVMemSegment; nothing is allocated or
+* released by this class.
+*/
+template <class T>
+class NCVVectorReuse : public NCVVector<T>
+{
+    // Default and copy construction are disabled (private declarations).
+    NCVVectorReuse();
+    NCVVectorReuse(const NCVVectorReuse &);
+
+public:
+
+    // Binds to the whole segment; the length is the number of complete T
+    // elements that fit in memSegment.size (integer division).
+    explicit NCVVectorReuse(const NCVMemSegment &memSegment)
+    {
+        this->bReused = false;
+        this->clear();
+
+        this->_length = memSegment.size / sizeof(T);
+        this->_ptr = (T *)memSegment.begin.ptr;
+        this->_memtype = memSegment.begin.memtype;
+
+        this->bReused = true;
+    }
+
+
+    // Binds to the first 'length' elements of the segment. When the segment is
+    // too small, a message is printed and the constructor returns early with
+    // the vector cleared and isMemReused() == false.
+    NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
+    {
+        this->bReused = false;
+        this->clear();
+
+        ncvAssertPrintReturn(length * sizeof(T) <= memSegment.size, \
+            "NCVVectorReuse ctor:: memory binding failed due to size mismatch", );
+
+        this->_length = length;
+        this->_ptr = (T *)memSegment.begin.ptr;
+        this->_memtype = memSegment.begin.memtype;
+
+        this->bReused = true;
+    }
+
+
+    // True when the constructor completed the binding.
+    NcvBool isMemReused() const
+    {
+        return this->bReused;
+    }
+
+private:
+
+    NcvBool bReused;
+};
+
+
+/**
+* NCVMatrix (2D)
+*
+* Base class describing a typed two-dimensional buffer: pointer, width and
+* height in elements, row pitch in bytes, and memory type. It performs no
+* allocation itself; derived classes fill in the protected fields.
+*/
+template <class T>
+class NCVMatrix
+{
+    // Copy construction is disabled (private declaration).
+    NCVMatrix(const NCVMatrix &);
+
+public:
+
+    NCVMatrix()
+    {
+        clear();
+    }
+
+    virtual ~NCVMatrix() {}
+
+
+    // Resets the descriptor to the unbound state.
+    void clear()
+    {
+        _ptr = NULL;
+        _pitch = 0;
+        _width = 0;
+        _height = 0;
+        _memtype = NCVMemoryTypeNone;
+    }
+
+
+    // Row pitch expressed in elements of T (pitch in bytes / sizeof(T)).
+    Ncv32u stride() const
+    {
+        return _pitch / sizeof(T);
+    }
+
+
+    // Copies bytes from this matrix into dst through memSegCopyHelper.
+    //   howMuch == 0 : pitch and height of both matrices must match; the whole
+    //                  pitch * height byte range is copied.
+    //   howMuch != 0 : number of bytes to copy; both matrices must be at least
+    //                  that large.
+    // Returns NCV_MEM_COPY_ERROR on a size mismatch, NCV_NULL_PTR when a typed
+    // matrix has a NULL pointer, NCV_SUCCESS without copying when this matrix
+    // has memory type None, otherwise the helper's status.
+    NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
+    {
+        // Widen before multiplying: pitch and height are 32-bit, so their
+        // product would otherwise wrap for buffers of 4 GiB and above.
+        const size_t srcBytes = (size_t)this->_pitch * this->_height;
+        const size_t dstBytes = (size_t)dst._pitch * dst._height;
+
+        if (howMuch == 0)
+        {
+            ncvAssertReturn(dst._pitch == this->_pitch &&
+                            dst._height == this->_height, NCV_MEM_COPY_ERROR);
+            howMuch = srcBytes;
+        }
+        else
+        {
+            ncvAssertReturn(dstBytes >= howMuch &&
+                            srcBytes >= howMuch &&
+                            howMuch > 0, NCV_MEM_COPY_ERROR);
+        }
+        ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) &&
+                        (dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);
+
+        NCVStatus ncvStat = NCV_SUCCESS;
+        if (this->_memtype != NCVMemoryTypeNone)
+        {
+            ncvStat = memSegCopyHelper(dst._ptr, dst._memtype,
+                                       this->_ptr, this->_memtype,
+                                       howMuch, cuStream);
+        }
+
+        return ncvStat;
+    }
+
+    T *ptr() const {return this->_ptr;}
+    Ncv32u width() const {return this->_width;}
+    Ncv32u height() const {return this->_height;}
+    Ncv32u pitch() const {return this->_pitch;}
+    NCVMemoryType memType() const {return this->_memtype;}
+
+protected:
+
+    T *_ptr;
+    Ncv32u _width;
+    Ncv32u _height;
+    Ncv32u _pitch;
+    NCVMemoryType _memtype;
+};
+
+
+/**
+* NCVMatrixAlloc
+*
+* NCVMatrix that obtains its storage from an INCVMemAllocator in the
+* constructor and hands it back in the destructor. The allocator is held by
+* reference, so it must outlive the matrix.
+*/
+template <class T>
+class NCVMatrixAlloc : public NCVMatrix<T>
+{
+    // Default and copy construction are disabled (private declarations).
+    NCVMatrixAlloc();
+    NCVMatrixAlloc(const NCVMatrixAlloc &);
+
+public:
+
+    // Allocates a width x height matrix.
+    //   pitch == 0 : the row pitch is width * sizeof(T) passed through alignUp()
+    //                with the allocator's alignment.
+    //   pitch != 0 : used as the row pitch in bytes; it must be at least the
+    //                computed pitch and have no bits in common with
+    //                (alignment - 1).
+    // On an invalid pitch or a failed allocation a message is printed and the
+    // constructor returns early, leaving the matrix cleared; callers can
+    // detect this with isMemAllocated().
+    NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0)
+        :
+        allocator(allocator)
+    {
+        NCVStatus ncvStat;
+
+        this->clear();
+        this->allocatedMem.clear();
+
+        Ncv32u widthBytes = width * sizeof(T);
+        Ncv32u pitchBytes = alignUp(widthBytes, allocator.alignment());
+
+        if (pitch != 0)
+        {
+            ncvAssertPrintReturn(pitch >= pitchBytes &&
+                (pitch & (allocator.alignment() - 1)) == 0,
+                "NCVMatrixAlloc ctor:: incorrect pitch passed", );
+            pitchBytes = pitch;
+        }
+
+        // Computed in size_t: the 32-bit product pitchBytes * height wraps for
+        // buffers of 4 GiB and above, which would request too little memory.
+        size_t requiredAllocSize = (size_t)pitchBytes * height;
+
+        ncvStat = allocator.alloc(this->allocatedMem, requiredAllocSize);
+        ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc ctor:: alloc failed", );
+
+        this->_ptr = (T *)this->allocatedMem.begin.ptr;
+        this->_width = width;
+        this->_height = height;
+        this->_pitch = pitchBytes;
+        this->_memtype = this->allocatedMem.begin.memtype;
+    }
+
+    // Returns the segment to the allocator; a failure is only reported through
+    // the debug output.
+    ~NCVMatrixAlloc()
+    {
+        NCVStatus ncvStat;
+
+        ncvStat = allocator.dealloc(this->allocatedMem);
+        ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc dtor:: dealloc failed");
+
+        this->clear();
+    }
+
+
+    // True when the segment pointer is non-NULL or the allocator reports
+    // isCounting().
+    NcvBool isMemAllocated() const
+    {
+        return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
+    }
+
+
+    // Alignment reported by the underlying allocator.
+    Ncv32u getAllocatorsAlignment() const
+    {
+        return allocator.alignment();
+    }
+
+
+    // Returns a copy of the segment descriptor obtained from the allocator.
+    NCVMemSegment getSegment() const
+    {
+        return allocatedMem;
+    }
+
+private:
+
+    INCVMemAllocator &allocator;
+    NCVMemSegment allocatedMem;
+};
+
+
+/**
+* NCVMatrixReuse
+*
+* NCVMatrix view over an existing NCVMemSegment; nothing is allocated or
+* released by this class.
+*/
+template <class T>
+class NCVMatrixReuse : public NCVMatrix<T>
+{
+    // Default and copy construction are disabled (private declarations).
+    NCVMatrixReuse();
+    NCVMatrixReuse(const NCVMatrixReuse &);
+
+public:
+
+    // Binds a width x height matrix to memSegment.
+    //   pitch == 0 : the row pitch is width * sizeof(T) passed through alignUp()
+    //                with 'alignment'.
+    //   pitch != 0 : used as the row pitch in bytes. With bSkipPitchCheck false
+    //                it must be at least the computed pitch and have no bits in
+    //                common with (alignment - 1); with bSkipPitchCheck true it
+    //                only has to cover width * sizeof(T).
+    // When a check fails, a message is printed and the constructor returns
+    // early with the matrix cleared and isMemReused() == false.
+    NCVMatrixReuse(const NCVMemSegment &memSegment, Ncv32u alignment, Ncv32u width, Ncv32u height, Ncv32u pitch=0, NcvBool bSkipPitchCheck=false)
+    {
+        this->bReused = false;
+        this->clear();
+
+        Ncv32u widthBytes = width * sizeof(T);
+        Ncv32u pitchBytes = alignUp(widthBytes, alignment);
+
+        if (pitch != 0)
+        {
+            if (!bSkipPitchCheck)
+            {
+                ncvAssertPrintReturn(pitch >= pitchBytes &&
+                    (pitch & (alignment - 1)) == 0,
+                    "NCVMatrixReuse ctor:: incorrect pitch passed", );
+            }
+            else
+            {
+                ncvAssertPrintReturn(pitch >= widthBytes, "NCVMatrixReuse ctor:: incorrect pitch passed", );
+            }
+            pitchBytes = pitch;
+        }
+
+        // Widen before multiplying: a wrapped 32-bit product could pass this
+        // check although the segment is far too small for the matrix.
+        ncvAssertPrintReturn((size_t)pitchBytes * height <= memSegment.size, \
+            "NCVMatrixReuse ctor:: memory binding failed due to size mismatch", );
+
+        this->_width = width;
+        this->_height = height;
+        this->_pitch = pitchBytes;
+        this->_ptr = (T *)memSegment.begin.ptr;
+        this->_memtype = memSegment.begin.memtype;
+
+        this->bReused = true;
+    }
+
+
+    // True when the constructor completed the binding.
+    NcvBool isMemReused() const
+    {
+        return this->bReused;
+    }
+
+private:
+
+    NcvBool bReused;
+};
+
+#endif // _ncv_hpp_
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp
@@ -0,0 +1,501 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// 
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
+//
+// The algorithm and code are explained in the upcoming GPU Computing Gems
+// chapter in detail:
+//
+//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
+//   PDF URL placeholder
+//   email: aobukhov@nvidia.com, devsupport@nvidia.com
+//
+// Credits for help with the code to:
+// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _ncvhaarobjectdetection_hpp_
+#define _ncvhaarobjectdetection_hpp_
+
+#include <string>
+#include "NCV.hpp"
+
+
+//==============================================================================
+//
+// Guaranteed size cross-platform classifier structures
+//
+//==============================================================================
+
+
+// 64-bit packed Haar feature: _ui2.x is reinterpreted as an NcvRect8u
+// (x, y, width, height, one byte each) and _ui2.y holds the bit pattern of a
+// 32-bit float weight.
+struct HaarFeature64
+{
+    uint2 _ui2;
+
+#define HaarFeature64_CreateCheck_MaxRectField                  0xFF
+
+    // Stores the rectangle. Every field must fit into 8 bits; otherwise
+    // NCV_HAAR_TOO_LARGE_FEATURES is returned and nothing is written.
+    // clsWidth / clsHeight are accepted for interface compatibility and are
+    // not used.
+    __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight)
+    {
+        (void)clsWidth;
+        (void)clsHeight;
+
+        ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);
+        // The origin is stored in 8 bits as well; without this check larger
+        // values would be truncated silently and describe a different rectangle.
+        ncvAssertReturn(rectX <= HaarFeature64_CreateCheck_MaxRectField && rectY <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);
+
+        NcvRect8u *packedRect = (NcvRect8u*)&(this->_ui2.x);
+        packedRect->x = (Ncv8u)rectX;
+        packedRect->y = (Ncv8u)rectY;
+        packedRect->width = (Ncv8u)rectWidth;
+        packedRect->height = (Ncv8u)rectHeight;
+        return NCV_SUCCESS;
+    }
+
+    // Stores the bit pattern of 'weight' in _ui2.y; always returns NCV_SUCCESS.
+    __host__ NCVStatus setWeight(Ncv32f weight)
+    {
+        ((Ncv32f*)&(this->_ui2.y))[0] = weight;
+        return NCV_SUCCESS;
+    }
+
+    // Unpacks the rectangle into the four output parameters.
+    __device__ __host__ void getRect(Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
+    {
+        NcvRect8u tmpRect = *(NcvRect8u*)(&this->_ui2.x);
+        *rectX = tmpRect.x;
+        *rectY = tmpRect.y;
+        *rectWidth = tmpRect.width;
+        *rectHeight = tmpRect.height;
+    }
+
+    // Returns the weight stored by setWeight().
+    __device__ __host__ Ncv32f getWeight(void)
+    {
+        return *(Ncv32f*)(&this->_ui2.y);
+    }
+};
+
+
+// 32-bit packed feature descriptor:
+//   bit  31     : tilted flag
+//   bits 24..30 : number of features (at most 0x7F)
+//   bits  0..23 : offset of the first feature (at most 0x00FFFFFF)
+struct HaarFeatureDescriptor32
+{
+private:
+
+#define HaarFeatureDescriptor32_Interpret_MaskFlagTilted        0x80000000
+#define HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures      0x7F
+#define HaarFeatureDescriptor32_NumFeatures_Shift               24
+#define HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset    0x00FFFFFF
+
+    Ncv32u desc;
+
+public:
+
+    // Packs the three fields into the descriptor. Returns
+    // NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER or
+    // NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE (descriptor left unchanged) when a
+    // value does not fit its bit field.
+    __host__ NCVStatus create(NcvBool bTilted, Ncv32u numFeatures, Ncv32u offsetFeatures)
+    {
+        if (numFeatures > HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures)
+        {
+            return NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER;
+        }
+        if (offsetFeatures > HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset)
+        {
+            return NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE;
+        }
+
+        const Ncv32u tiltedBit = (bTilted ? HaarFeatureDescriptor32_Interpret_MaskFlagTilted : 0);
+        const Ncv32u countBits = (numFeatures << HaarFeatureDescriptor32_NumFeatures_Shift);
+        this->desc = tiltedBit | countBits | offsetFeatures;
+        return NCV_SUCCESS;
+    }
+
+    // True when the tilted flag (bit 31) is set.
+    __device__ __host__ NcvBool isTilted(void)
+    {
+        return 0 != (this->desc & HaarFeatureDescriptor32_Interpret_MaskFlagTilted);
+    }
+
+    // Feature count: the tilted flag is masked off, then the count is shifted down.
+    __device__ __host__ Ncv32u getNumFeatures(void)
+    {
+        const Ncv32u withoutFlag = this->desc & ~HaarFeatureDescriptor32_Interpret_MaskFlagTilted;
+        return withoutFlag >> HaarFeatureDescriptor32_NumFeatures_Shift;
+    }
+
+    // Offset of the first feature (low 24 bits).
+    __device__ __host__ Ncv32u getFeaturesOffset(void)
+    {
+        return HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset & this->desc;
+    }
+};
+
+
+// 32-bit descriptor of a classifier node's child. Bit 30 acts as a switch:
+//   bit 30 clear : the word is the bit pattern of a float leaf value
+//   bit 30 set   : the remaining bits hold the offset of the next node
+struct HaarClassifierNodeDescriptor32
+{
+    uint1 _ui1;
+
+#define HaarClassifierNodeDescriptor32_Interpret_MaskSwitch     (1 << 30)
+
+    // Stores a leaf value. A float whose bit pattern has bit 30 set cannot be
+    // told apart from a node offset and is rejected with
+    // NCV_HAAR_XML_LOADING_EXCEPTION.
+    __host__ NCVStatus create(Ncv32f leafValue)
+    {
+        if ((*(Ncv32u *)&leafValue) & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)
+        {
+            return NCV_HAAR_XML_LOADING_EXCEPTION;
+        }
+        *(Ncv32f *)&this->_ui1 = leafValue;
+        return NCV_SUCCESS;
+    }
+
+    // Stores the offset of the next node and sets the switch bit. Offsets that
+    // reach bit 30 are rejected with NCV_HAAR_XML_LOADING_EXCEPTION.
+    __host__ NCVStatus create(Ncv32u offsetHaarClassifierNode)
+    {
+        if (offsetHaarClassifierNode >= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)
+        {
+            return NCV_HAAR_XML_LOADING_EXCEPTION;
+        }
+        this->_ui1.x = offsetHaarClassifierNode;
+        this->_ui1.x |= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch;
+        return NCV_SUCCESS;
+    }
+
+    // True when the switch bit is clear, i.e. the word holds a leaf value.
+    __device__ __host__ NcvBool isLeaf(void)
+    {
+        return !(this->_ui1.x & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);
+    }
+
+    // Host-side read of the stored word reinterpreted as a float.
+    __host__ Ncv32f getLeafValueHost(void)
+    {
+        return *(Ncv32f *)&this->_ui1.x;
+    }
+
+    // Device-side read of the stored word as a float; only compiled when
+    // __CUDACC__ is defined because it uses the __int_as_float intrinsic.
+#ifdef __CUDACC__
+    __device__ Ncv32f getLeafValue(void)
+    {
+        return __int_as_float(this->_ui1.x);
+    }
+#endif
+
+    // Stored word with the switch bit masked off.
+    __device__ __host__ Ncv32u getNextNodeOffset(void)
+    {
+        return (this->_ui1.x & ~HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);
+    }
+};
+
+
+// 128-bit packed classifier node stored in a uint4:
+//   x : HaarFeatureDescriptor32
+//   y : bit pattern of the float threshold
+//   z : HaarClassifierNodeDescriptor32 of the left child
+//   w : HaarClassifierNodeDescriptor32 of the right child
+// Each setter overwrites exactly one 32-bit component and always returns
+// NCV_SUCCESS; each getter reinterprets the matching component.
+struct HaarClassifierNode128
+{
+    uint4 _ui4;
+
+    __host__ NCVStatus setFeatureDesc(HaarFeatureDescriptor32 f)
+    {
+        this->_ui4.x = *(Ncv32u *)&f;
+        return NCV_SUCCESS;
+    }
+
+    __host__ NCVStatus setThreshold(Ncv32f t)
+    {
+        this->_ui4.y = *(Ncv32u *)&t;
+        return NCV_SUCCESS;
+    }
+
+    __host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl)
+    {
+        this->_ui4.z = *(Ncv32u *)&nl;
+        return NCV_SUCCESS;
+    }
+
+    __host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr)
+    {
+        this->_ui4.w = *(Ncv32u *)&nr;
+        return NCV_SUCCESS;
+    }
+
+    __host__ __device__ HaarFeatureDescriptor32 getFeatureDesc(void)
+    {
+        return *(HaarFeatureDescriptor32 *)&this->_ui4.x;
+    }
+
+    __host__ __device__ Ncv32f getThreshold(void)
+    {
+        return *(Ncv32f*)&this->_ui4.y;
+    }
+
+    __host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void)
+    {
+        return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.z;
+    }
+
+    __host__ __device__ HaarClassifierNodeDescriptor32 getRightNodeDesc(void)
+    {
+        return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.w;
+    }
+};
+
+
+// 64-bit packed cascade stage stored in a uint2:
+//   x             : bit pattern of the float stage threshold
+//   y bits 16..31 : offset of the stage's first classifier root node
+//   y bits  0..15 : number of classifier root nodes in the stage
+struct HaarStage64
+{
+#define HaarStage64_Interpret_MaskRootNodes         0x0000FFFF
+#define HaarStage64_Interpret_MaskRootNodeOffset    0xFFFF0000
+#define HaarStage64_Interpret_ShiftRootNodeOffset   16
+
+    uint2 _ui2;
+
+    // Stores the bit pattern of the threshold in x; always returns NCV_SUCCESS.
+    __host__ NCVStatus setStageThreshold(Ncv32f t)
+    {
+        this->_ui2.x = *(Ncv32u *)&t;
+        return NCV_SUCCESS;
+    }
+
+    // Writes the upper 16 bits of y and keeps the lower 16 bits. Values above
+    // 0xFFFF are rejected with NCV_HAAR_XML_LOADING_EXCEPTION.
+    // The current content of y is read back, so y should hold a defined value
+    // before the first setter call.
+    __host__ NCVStatus setStartClassifierRootNodeOffset(Ncv32u val)
+    {
+        if (val > (HaarStage64_Interpret_MaskRootNodeOffset >> HaarStage64_Interpret_ShiftRootNodeOffset))
+        {
+            return NCV_HAAR_XML_LOADING_EXCEPTION;
+        }
+        this->_ui2.y = (val << HaarStage64_Interpret_ShiftRootNodeOffset) | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);
+        return NCV_SUCCESS;
+    }
+
+    // Writes the lower 16 bits of y and keeps the upper 16 bits. Values above
+    // 0xFFFF are rejected with NCV_HAAR_XML_LOADING_EXCEPTION.
+    __host__ NCVStatus setNumClassifierRootNodes(Ncv32u val)
+    {
+        if (val > HaarStage64_Interpret_MaskRootNodes)
+        {
+            return NCV_HAAR_XML_LOADING_EXCEPTION;
+        }
+        this->_ui2.y = val | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodeOffset);
+        return NCV_SUCCESS;
+    }
+
+    // Returns x reinterpreted as a float.
+    __host__ __device__ Ncv32f getStageThreshold(void)
+    {
+        return *(Ncv32f*)&this->_ui2.x;
+    }
+
+    // Returns the upper 16 bits of y.
+    __host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void)
+    {
+        return (this->_ui2.y >> HaarStage64_Interpret_ShiftRootNodeOffset);
+    }
+
+    // Returns the lower 16 bits of y.
+    __host__ __device__ Ncv32u getNumClassifierRootNodes(void)
+    {
+        return (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);
+    }
+};
+
+
+// Compile-time verification that the packed classifier structures have the
+// byte sizes encoded in their names (64 / 32 / 32 / 128 / 64 bits).
+NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8);
+NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
+NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
+NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
+NPPST_CT_ASSERT(sizeof(HaarStage64) == 8);
+
+
+//==============================================================================
+//
+// Classifier cascade descriptor
+//
+//==============================================================================
+
+
+// Summary record describing a Haar classifier cascade. It is passed together
+// with the stage / node / feature vectors to the detection, load and store
+// functions declared further down in this header.
+// NOTE(review): the per-field notes below are inferred from the field names only; the code that fills and reads them is outside this header — confirm.
+struct HaarClassifierCascadeDescriptor
+{
+    Ncv32u NumStages;               // presumably the number of HaarStage64 entries
+    Ncv32u NumClassifierRootNodes;  // presumably the number of root classifier nodes
+    Ncv32u NumClassifierTotalNodes; // presumably the total number of HaarClassifierNode128 entries
+    Ncv32u NumFeatures;             // presumably the number of HaarFeature64 entries
+    NcvSize32u ClassifierSize;      // presumably the classifier window size — TODO confirm units
+    NcvBool bNeedsTiltedII;         // presumably set when a tilted integral image is required
+    NcvBool bHasStumpsOnly;         // presumably set when every classifier is a single-node stump
+};
+
+
+//==============================================================================
+//
+// Functional interface
+//
+//==============================================================================
+
+
+// Option flags for the object detection pipeline. Apart from the zero default,
+// every value is a distinct single bit.
+// NOTE(review): presumably meant to be OR-ed together into the Ncv32u `flags`
+// argument of ncvDetectObjectsMultiScale_device — confirm against the implementation.
+enum
+{
+    NCVPipeObjDet_Default               = 0,
+    NCVPipeObjDet_UseFairImageScaling   = 1 << 0,
+    NCVPipeObjDet_FindLargestObject     = 1 << 1,
+    NCVPipeObjDet_VisualizeInPlace      = 1 << 2
+};
+
+
+// Multi-scale object detection entry point (declaration only; the definition
+// is not part of this header).
+//   d_srcImg, srcRoi          - 8-bit source image and the region of interest
+//   d_dstRects, dstNumRects   - rectangle vector and rectangle count; both are
+//                               taken by non-const reference
+//   haar, h_/d_HaarStages,
+//   d_HaarNodes, d_HaarFeatures - the cascade descriptor and its data vectors
+//   minObjSize .. flags       - tuning parameters; the "default" values noted
+//                               inline are recommendations only, not C++ default
+//                               arguments, so callers must pass every value
+//   gpuAllocator, cpuAllocator, devPropMajor, devPropMinor, cuStream
+//                             - allocators, device version and CUDA stream
+// NOTE(review): parameter roles and the meaning of the d_/h_ prefixes (presumably device/host memory) are inferred from names — confirm against the implementation.
+NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
+                                            NcvSize32u srcRoi,
+                                            NCVVector<NcvRect32u> &d_dstRects,
+                                            Ncv32u &dstNumRects,
+
+                                            HaarClassifierCascadeDescriptor &haar,
+                                            NCVVector<HaarStage64> &h_HaarStages,
+                                            NCVVector<HaarStage64> &d_HaarStages,
+                                            NCVVector<HaarClassifierNode128> &d_HaarNodes,
+                                            NCVVector<HaarFeature64> &d_HaarFeatures,
+
+                                            NcvSize32u minObjSize,
+                                            Ncv32u minNeighbors,      //default 4
+                                            Ncv32f scaleStep,         //default 1.2f
+                                            Ncv32u pixelStep,         //default 1
+                                            Ncv32u flags,             //default NCVPipeObjDet_Default
+
+                                            INCVMemAllocator &gpuAllocator,
+                                            INCVMemAllocator &cpuAllocator,
+                                            Ncv32u devPropMajor,
+                                            Ncv32u devPropMinor,
+                                            cudaStream_t cuStream);
+
+
+// NOTE(review): presumably the sentinel stored in a 32-bit pixel-mask element that carries no valid anchor — confirm against the code that uses it.
+#define OBJDET_MASK_ELEMENT_INVALID_32U     0xFFFFFFFF
+// NOTE(review): presumably the border width used around the window when computing the standard deviation — confirm against the code that uses it.
+#define HAAR_STDDEV_BORDER                  1
+
+
+// Device variant of the cascade application step (declaration only).
+// Takes the integral image, a float weights matrix, a 32-bit pixel mask and
+// the cascade data; numDetections is a non-const reference.
+// Unlike the host variant it also takes a device copy of the stages, both
+// allocators, the device version numbers and a CUDA stream.
+// NOTE(review): parameter roles are inferred from names and the d_/h_ prefixes presumably denote device/host memory — confirm against the definition.
+NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
+                                               NCVMatrix<Ncv32f> &d_weights,
+                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
+                                               Ncv32u &numDetections,
+                                               HaarClassifierCascadeDescriptor &haar,
+                                               NCVVector<HaarStage64> &h_HaarStages,
+                                               NCVVector<HaarStage64> &d_HaarStages,
+                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
+                                               NCVVector<HaarFeature64> &d_HaarFeatures,
+                                               NcvBool bMaskElements,
+                                               NcvSize32u anchorsRoi,
+                                               Ncv32u pixelStep,
+                                               Ncv32f scaleArea,
+                                               INCVMemAllocator &gpuAllocator,
+                                               INCVMemAllocator &cpuAllocator,
+                                               Ncv32u devPropMajor,
+                                               Ncv32u devPropMinor,
+                                               cudaStream_t cuStream);
+
+
+// Host variant of the cascade application step (declaration only).
+// Same leading parameters as ncvApplyHaarClassifierCascade_device, but with
+// h_-prefixed containers and without the device stage copy, allocators,
+// device version numbers or CUDA stream.
+// NOTE(review): presumably a CPU reference implementation operating on host memory — confirm against the definition.
+NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
+                                             NCVMatrix<Ncv32f> &h_weights,
+                                             NCVMatrixAlloc<Ncv32u> &h_pixelMask,
+                                             Ncv32u &numDetections,
+                                             HaarClassifierCascadeDescriptor &haar,
+                                             NCVVector<HaarStage64> &h_HaarStages,
+                                             NCVVector<HaarClassifierNode128> &h_HaarNodes,
+                                             NCVVector<HaarFeature64> &h_HaarFeatures,
+                                             NcvBool bMaskElements,
+                                             NcvSize32u anchorsRoi,
+                                             Ncv32u pixelStep,
+                                             Ncv32f scaleArea);
+
+
+// Declaration of the rectangle drawing routine for 8-bit images, device variant.
+// Takes a raw destination pointer with stride/width/height, an array of
+// numRects rectangles, the 8-bit value to draw with, and a CUDA stream.
+// NOTE(review): d_ pointers presumably reference device memory; the unit of dstStride (bytes vs. elements) is not visible here — confirm.
+NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
+                                 Ncv32u dstStride,
+                                 Ncv32u dstWidth,
+                                 Ncv32u dstHeight,
+                                 NcvRect32u *d_rects,
+                                 Ncv32u numRects,
+                                 Ncv8u color,
+                                 cudaStream_t cuStream);
+
+
+// Declaration of the rectangle drawing routine for 32-bit images, device variant.
+// Same parameter list as ncvDrawRects_8u_device except that the destination
+// and the color are Ncv32u.
+// NOTE(review): d_ pointers presumably reference device memory; the unit of dstStride (bytes vs. elements) is not visible here — confirm.
+NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
+                                  Ncv32u dstStride,
+                                  Ncv32u dstWidth,
+                                  Ncv32u dstHeight,
+                                  NcvRect32u *d_rects,
+                                  Ncv32u numRects,
+                                  Ncv32u color,
+                                  cudaStream_t cuStream);
+
+
+// Declaration of the rectangle drawing routine for 8-bit images, host variant.
+// Same parameters as ncvDrawRects_8u_device minus the CUDA stream, with
+// h_-prefixed pointers.
+// NOTE(review): h_ pointers presumably reference host memory; the unit of dstStride (bytes vs. elements) is not visible here — confirm.
+NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
+                               Ncv32u dstStride,
+                               Ncv32u dstWidth,
+                               Ncv32u dstHeight,
+                               NcvRect32u *h_rects,
+                               Ncv32u numRects,
+                               Ncv8u color);
+
+
+// Declaration of the rectangle drawing routine for 32-bit images, host variant.
+// Same parameters as ncvDrawRects_32u_device minus the CUDA stream, with
+// h_-prefixed pointers.
+// NOTE(review): h_ pointers presumably reference host memory; the unit of dstStride (bytes vs. elements) is not visible here — confirm.
+NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
+                                Ncv32u dstStride,
+                                Ncv32u dstWidth,
+                                Ncv32u dstHeight,
+                                NcvRect32u *h_rects,
+                                Ncv32u numRects,
+                                Ncv32u color);
+
+
+// NOTE(review): presumably the relative tolerance used when deciding whether two detection rectangles are similar (cf. the intersectEps argument of ncvFilterHypotheses_host) — confirm against the callers.
+#define RECT_SIMILARITY_PROPORTION      0.2f
+
+
+// Device variant of the "grow detections vector" step (declaration only).
+// totalDetections is a non-const reference; totalMaxDetections is passed by
+// value alongside it.
+// NOTE(review): presumably turns numPixelMaskDetections entries of pixelMask into rectWidth x rectHeight rectangles scaled by curScale, appends them to hypotheses and updates totalDetections up to totalMaxDetections — confirm against the definition.
+NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
+                                         Ncv32u numPixelMaskDetections,
+                                         NCVVector<NcvRect32u> &hypotheses,
+                                         Ncv32u &totalDetections,
+                                         Ncv32u totalMaxDetections,
+                                         Ncv32u rectWidth,
+                                         Ncv32u rectHeight,
+                                         Ncv32f curScale,
+                                         cudaStream_t cuStream);
+
+
+// Host variant of the "grow detections vector" step (declaration only).
+// Identical parameter list to ncvGrowDetectionsVector_device except that no
+// CUDA stream is taken.
+// NOTE(review): presumably operates on host-memory vectors and mirrors the device variant's behavior — confirm against the definition.
+NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
+                                       Ncv32u numPixelMaskDetections,
+                                       NCVVector<NcvRect32u> &hypotheses,
+                                       Ncv32u &totalDetections,
+                                       Ncv32u totalMaxDetections,
+                                       Ncv32u rectWidth,
+                                       Ncv32u rectHeight,
+                                       Ncv32f curScale);
+
+
+// Host-side hypothesis filtering (declaration only).
+// hypotheses and numHypotheses are non-const references, so both may be
+// modified by the call; hypothesesWeights is the only pointer parameter.
+// NOTE(review): presumably groups similar rectangles (tolerance intersectEps), drops groups with fewer than minNeighbors members, and treats hypothesesWeights as an optional output — confirm, including whether NULL is accepted.
+NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
+                                   Ncv32u &numHypotheses,
+                                   Ncv32u minNeighbors,
+                                   Ncv32f intersectEps,
+                                   NCVVector<Ncv32u> *hypothesesWeights);
+
+
+// Queries the dimensions of the cascade stored in `filename` (declaration only).
+// numStages, numNodes and numFeatures are non-const references.
+// NOTE(review): presumably outputs used to size the vectors passed to ncvHaarLoadFromFile_host — confirm against the definition.
+NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
+                                   Ncv32u &numNodes, Ncv32u &numFeatures);
+
+
+// Loads a cascade from `filename` into host-side containers (declaration only).
+// The descriptor and the three vectors are taken by non-const reference.
+// NOTE(review): presumably the vectors must already be allocated with the sizes reported by ncvHaarGetClassifierSize; supported file formats are not visible here — confirm.
+NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
+                                   HaarClassifierCascadeDescriptor &haar,
+                                   NCVVector<HaarStage64> &h_HaarStages,
+                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,
+                                   NCVVector<HaarFeature64> &h_HaarFeatures);
+
+
+// Stores a host-side cascade to `filename` (declaration only).
+// Note that `haar` is taken by value here, whereas the other functions in
+// this header take the descriptor by reference.
+// NOTE(review): "NVBIN" is presumably a binary cascade format specific to this library — confirm against the definition.
+NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
+                                 HaarClassifierCascadeDescriptor haar,
+                                 NCVVector<HaarStage64> &h_HaarStages,
+                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
+                                 NCVVector<HaarFeature64> &h_HaarFeatures);
+
+
+
+#endif // _ncvhaarobjectdetection_hpp_
--- a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp
+++ b/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp
@@ -0,0 +1,174 @@
+////////////////////////////////////////////////////////////////////////////////
+// The Loki Library
+// Copyright (c) 2001 by Andrei Alexandrescu
+// This code accompanies the book:
+// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design 
+//     Patterns Applied". Copyright (c) 2001. Addison-Wesley.
+// Permission to use, copy, modify, distribute and sell this software for any 
+//     purpose is hereby granted without fee, provided that the above copyright 
+//     notice appear in all copies and that both that copyright notice and this 
+//     permission notice appear in supporting documentation.
+// The author or Addison-Welsey Longman make no representations about the 
+//     suitability of this software for any purpose. It is provided "as is" 
+//     without express or implied warranty.
+// http://loki-lib.sourceforge.net/index.php?n=Main.License
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _ncvruntimetemplates_hpp_
+#define _ncvruntimetemplates_hpp_
+
+#include <stdarg.h>
+#include <vector>
+
+
+namespace Loki
+{
+    //==============================================================================
+    // NullType
+    // Empty tag type standing for "no type"; conventionally terminates a typelist.
+    //==============================================================================
+
+    class NullType {};
+
+    //==============================================================================
+    // Typelist<T, U>
+    // Compile-time pair from which typelists of any length are chained:
+    //     Head - the first element
+    //     Tail - the remainder (another Typelist, or the end marker)
+    //==============================================================================
+
+    template <typename HeadT, typename TailT>
+    struct Typelist
+    {
+        typedef HeadT Head;
+        typedef TailT Tail;
+    };
+
+    //==============================================================================
+    // Int2Type<v>
+    // Maps every integral constant v to its own distinct type.
+    // The nested enumerator 'value' evaluates to v.
+    //==============================================================================
+
+    template <int Constant>
+    struct Int2Type
+    {
+        enum { value = Constant };
+    };
+
+    namespace TL
+    {
+        //==============================================================================
+        // TypeAt<TList, index>::Result
+        // Yields the type stored at position 'index' of typelist TList.
+        // Only Typelist<...> arguments are specialized, so walking past the end
+        // of the list hits the undefined primary template: a compile-time error.
+        //==============================================================================
+
+        template <typename List, unsigned int Position> struct TypeAt;
+
+        // Position 0: the answer is the head of the list.
+        template <typename First, typename Rest>
+        struct TypeAt<Typelist<First, Rest>, 0>
+        {
+            typedef First Result;
+        };
+
+        // Position > 0: drop the head and look one position earlier in the tail.
+        template <typename First, typename Rest, unsigned int Position>
+        struct TypeAt<Typelist<First, Rest>, Position>
+        {
+            typedef typename TypeAt<Rest, Position - 1>::Result Result;
+        };
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime boolean template instance dispatcher
+// Cyril Crassin <cyril.crassin@icare3d.org>
+// NVIDIA, 2010
+////////////////////////////////////////////////////////////////////////////////
+
+namespace NCVRuntimeTemplateBool
+{
+    // KernelCaller converts a run-time list of boolean-like ints into a
+    // compile-time typelist of Loki::Int2Type<true> / Loki::Int2Type<false>
+    // entries and finally passes an instance of that typelist to
+    // functor.call(). One value is consumed per recursion level, so every
+    // true/false combination of the NumArguments values gets instantiated.
+    template<typename TList, int NumArguments, class Func>
+    struct KernelCaller
+    {
+        // User-facing entry point: reads NumArguments ints from the variadic
+        // arguments that follow 'dummy' and forwards them as a vector.
+        // 'dummy' exists only as the named anchor required by va_start.
+        static void call(Func &functor, int dummy, ...)
+        {
+            std::vector<int> collected;
+
+            va_list args;
+            va_start(args, dummy);
+            for (int remaining = NumArguments; remaining > 0; --remaining)
+            {
+                collected.push_back(va_arg(args, int));
+            }
+            va_end(args);
+
+            // Resolves to the vector overload below
+            call(functor, collected);
+        }
+
+        // Recursive step: removes the last value from templateParamList,
+        // prepends the matching Int2Type to TList and recurses with one
+        // argument fewer. The list must not be empty on entry.
+        static void call( Func &functor, std::vector<int> &templateParamList)
+        {
+            typedef KernelCaller<
+                Loki::Typelist<Loki::Int2Type<true>, TList >,
+                NumArguments-1, Func > CallerIfTrue;
+            typedef KernelCaller<
+                Loki::Typelist<Loki::Int2Type<false>, TList >,
+                NumArguments-1, Func > CallerIfFalse;
+
+            const bool isSet = (templateParamList.back() != 0);
+            templateParamList.pop_back();
+
+            // Both branches are compiled; the run-time value picks one of them
+            if (!isSet)
+            {
+                CallerIfFalse::call(functor, templateParamList);
+            }
+            else
+            {
+                CallerIfTrue::call(functor, templateParamList);
+            }
+        }
+    };
+
+    // Recursion end: no values left to convert, so the functor is invoked
+    // with the completed typelist.
+    template<class TList, class Func>
+    struct KernelCaller<TList, 0, Func>
+    {
+        static void call(Func &functor)
+        {
+            // A TList temporary is passed only so that the functor's call()
+            // can be selected/deduced from the typelist type
+            functor.call(TList());
+        }
+
+        // Same as above; the (by now unused) value list is ignored.
+        static void call(Func &functor, std::vector<int> & /*templateParams*/)
+        {
+            call(functor);
+        }
+    };
+}
+
+#endif //_ncvruntimetemplates_hpp_