First version of CascadeClassifier_GPU.

Currently builds with VS2008 only.
Added a sample for it.
New NPP_staging binaries (VS2008 only).
Anatoly Baksheev 2011-01-13 13:04:00 +00:00
parent 31e582e314
commit 1a94186195
17 changed files with 6067 additions and 186 deletions


@@ -188,14 +188,14 @@ struct NppStSize32u
enum NppStStatus
{
//already present in NPP
//NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR)
//NPP_ERROR = -1, ///< Unknown error
//NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error
//NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error
//NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned
//NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error
//NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error
//NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error
//to be added
NPP_INVALID_ROI, ///< Invalid region of interest argument
@@ -244,7 +244,7 @@ extern "C" {
/** \defgroup core_npp NPP Core
* Basic functions for CUDA streams management.
* WARNING: These functions cannot be exported into a DLL, so they are available only with the static version of NPP_staging
* @{
*/
@@ -569,6 +569,13 @@ NppStStatus nppiStTranspose_64f_C1R_host(NppSt64f *h_src, NppSt32u srcStride,
NppStStatus nppiStIntegralGetSize_8u32u(NppStSize32u roiSize, NppSt32u *pBufsize);
/**
* Calculates the size of the temporary buffer for integral image creation
* \see nppiStIntegralGetSize_8u32u
*/
NppStStatus nppiStIntegralGetSize_32f32f(NppStSize32u roiSize, NppSt32u *pBufsize);
/**
* Creates an integral image representation for the input image
*
@@ -587,6 +594,15 @@ NppStStatus nppiStIntegral_8u32u_C1R(NppSt8u *d_src, NppSt32u srcStep,
NppSt8u *pBuffer, NppSt32u bufSize);
/**
* Creates an integral image representation for the input image
* \see nppiStIntegral_8u32u_C1R
*/
NppStStatus nppiStIntegral_32f32f_C1R(NppSt32f *d_src, NppSt32u srcStep,
NppSt32f *d_dst, NppSt32u dstStep, NppStSize32u roiSize,
NppSt8u *pBuffer, NppSt32u bufSize);
/**
* Creates an integral image representation for the input image. Host implementation
*
@@ -602,6 +618,14 @@ NppStStatus nppiStIntegral_8u32u_C1R_host(NppSt8u *h_src, NppSt32u srcStep,
NppSt32u *h_dst, NppSt32u dstStep, NppStSize32u roiSize);
/**
* Creates an integral image representation for the input image. Host implementation
* \see nppiStIntegral_8u32u_C1R_host
*/
NppStStatus nppiStIntegral_32f32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep,
NppSt32f *h_dst, NppSt32u dstStep, NppStSize32u roiSize);
/**
* Calculates the size of the temporary buffer for squared integral image creation
*


@@ -35,6 +35,13 @@ source_group("Include" FILES ${lib_hdrs})
file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
if (HAVE_CUDA AND MSVC)
file(GLOB ncv_srcs "src/nvidia/*.cpp")
file(GLOB ncv_hdrs "src/nvidia/*.h*")
file(GLOB ncv_cuda "src/nvidia/*.cu")
source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda})
endif()
if (HAVE_CUDA)
get_filename_component(_path_to_findnpp "${CMAKE_CURRENT_LIST_FILE}" PATH)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${_path_to_findnpp})
@@ -68,19 +75,16 @@ if (HAVE_CUDA)
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
endif()
include(FindNPP_staging.cmake)
include_directories(${NPPST_INC})
CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})
if(PCHSupport_FOUND)
set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp)
@@ -114,6 +118,7 @@ target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} )
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES})
target_link_libraries(${the_target} ${NPPST_LIB})
CUDA_ADD_CUFFT_TO_TARGET(${the_target})
endif()


@@ -1380,87 +1380,39 @@ namespace cv
explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}
};
////////////////////////////////// CascadeClassifier_GPU //////////////////////////////////////////
// The cascade classifier class for object detection.
class CV_EXPORTS CascadeClassifier_GPU
{
public:
CascadeClassifier_GPU();
CascadeClassifier_GPU(const string& filename);
~CascadeClassifier_GPU();
bool empty() const;
bool load(const string& filename);
void release();
/* returns number of detected objects */
int detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor=1.2, int minNeighbors=4, Size minSize=Size());
bool findLargestObject;
bool visualizeInPlace;
Size getClassifierSize() const;
private:
struct CascadeClassifierImpl;
CascadeClassifierImpl* impl;
};
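Typical intended use of the new class, as a sketch (grayFrame stands for a CV_8UC1 cv::Mat; the one-row buffer of rectangles and the colRange/download readback follow the detectMultiScale implementation later in this commit):

cv::gpu::CascadeClassifier_GPU cascade("haarcascade_frontalface_alt.xml");

cv::gpu::GpuMat d_gray, d_objects;
d_gray.upload(grayFrame);                        // 8-bit grayscale input frame

int n = cascade.detectMultiScale(d_gray, d_objects);  // returns detection count

cv::Mat h_objects;
if (n > 0)
{
    d_objects.colRange(0, n).download(h_objects);    // first n columns hold the hits
    cv::Rect* faces = h_objects.ptr<cv::Rect>();
    for (int i = 0; i < n; ++i)
        cv::rectangle(grayFrame, faces[i], cv::Scalar(255));
}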
////////////////////////////////// SURF //////////////////////////////////////////
struct CV_EXPORTS SURFParams_GPU
{
SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4), initialScale(2.f),
l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f), l4(1.f/1.5f),
edgeScale(0.81f), initialStep(1), extended(true), featuresRatio(0.01f) {}
//! The interest operator threshold
float threshold;


@@ -170,8 +170,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
if (src.type() == CV_8UC1)
{
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, nppLut.ptr<Npp32s>(), lvls.pLevels, 256) );
}
else
{
@@ -186,8 +185,7 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst)
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), src.step, dst.ptr<Npp8u>(), dst.step, sz, pValues3, lvls.pLevels3, lvls.nValues3) );
}
}


@@ -42,69 +42,751 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA) || (defined(_MSC_VER) && _MSC_VER != 1500) || !defined(_MSC_VER)
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); }
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string&) { throw_nogpu(); }
cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { throw_nogpu(); }
bool cv::gpu::CascadeClassifier_GPU::empty() const { throw_nogpu(); return true; }
bool cv::gpu::CascadeClassifier_GPU::load(const string&) { throw_nogpu(); return true; }
Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const { throw_nogpu(); return Size(); }
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& , GpuMat& , double , int , Size) { throw_nogpu(); return 0; }
#if defined (HAVE_CUDA)
NCVStatus loadFromXML(const string&, HaarClassifierCascadeDescriptor&, vector<HaarStage64>&,
vector<HaarClassifierNode128>&, vector<HaarFeature64>&) { throw_nogpu(); return NCVStatus(); }
void groupRectangles(vector<NcvRect32u>&, int, double, vector<Ncv32u>*) { throw_nogpu(); }
#endif
#else
struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
{
CascadeClassifierImpl(const string& filename) : lastAllocatedFrameSize(-1, -1)
{
ncvSetDebugOutputHandler(NCVDebugOutputHandler);
if (ncvStat != load(filename))
CV_Error(CV_GpuApiCallError, "Error in GPU cacade load");
}
NCVStatus process(const GpuMat& src, GpuMat& objects, float scaleStep, int minNeighbors, bool findLargestObject, bool visualizeInPlace, NcvSize32u ncvMinSize, /*out*/unsigned int& numDetections)
{
calculateMemReqsAndAllocate(src.size());
NCVMemPtr src_beg;
src_beg.ptr = (void*)src.ptr<Ncv8u>();
src_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment src_seg;
src_seg.begin = src_beg;
src_seg.size = src.step * src.rows;
NCVMatrixReuse<Ncv8u> d_src(src_seg, devProp.textureAlignment, src.cols, src.rows, src.step, true);
//NCVMatrixAlloc<Ncv8u> d_src(*gpuAllocator, src.cols, src.rows);
//ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
//NCVMatrixAlloc<Ncv8u> h_src(*cpuAllocator, src.cols, src.rows);
//ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
CV_Assert(objects.rows == 1);
NCVMemPtr objects_beg;
objects_beg.ptr = (void*)objects.ptr<NcvRect32u>();
objects_beg.memtype = NCVMemoryTypeDevice;
NCVMemSegment objects_seg;
objects_seg.begin = objects_beg;
objects_seg.size = objects.step * objects.rows;
NCVVectorReuse<NcvRect32u> d_rects(objects_seg, objects.cols);
//NCVVectorAlloc<NcvRect32u> d_rects(*gpuAllocator, 100);
//ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
Ncv32u flags = 0;
flags |= findLargestObject? NCVPipeObjDet_FindLargestObject : 0;
flags |= visualizeInPlace ? NCVPipeObjDet_VisualizeInPlace : 0;
ncvStat = ncvDetectObjectsMultiScale_device(
d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures,
ncvMinSize,
minNeighbors,
scaleStep, 1,
flags,
*gpuAllocator, *cpuAllocator, devProp.major, devProp.minor, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
////
NcvSize32u getClassifierSize() const { return haar.ClassifierSize; }
cv::Size getClassifierCvSize() const { return cv::Size(haar.ClassifierSize.width, haar.ClassifierSize.height); }
private:
static void NCVDebugOutputHandler(const char* msg) { CV_Error(CV_GpuApiCallError, msg); }
NCVStatus load(const string& classifierFile)
{
int devId = cv::gpu::getDevice();
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR);
// Load the classifier from file (assuming its size is about 1 mb) using a simple allocator
gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice);
cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned);
ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR);
Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", NCV_FILE_ERROR);
h_haarStages = new NCVVectorAlloc<HaarStage64>(*cpuCascadeAllocator, haarNumStages);
h_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*cpuCascadeAllocator, haarNumNodes);
h_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*cpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(h_haarStages->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(h_haarNodes->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(h_haarFeatures->isMemAllocated(), "Error in cascade CPU allocator", NCV_CUDA_ERROR);
ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, *h_haarStages, *h_haarNodes, *h_haarFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", NCV_FILE_ERROR);
d_haarStages = new NCVVectorAlloc<HaarStage64>(*gpuCascadeAllocator, haarNumStages);
d_haarNodes = new NCVVectorAlloc<HaarClassifierNode128>(*gpuCascadeAllocator, haarNumNodes);
d_haarFeatures = new NCVVectorAlloc<HaarFeature64>(*gpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(d_haarStages->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(d_haarNodes->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(d_haarFeatures->isMemAllocated(), "Error in cascade GPU allocator", NCV_CUDA_ERROR);
ncvStat = h_haarStages->copySolid(*d_haarStages, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
ncvStat = h_haarNodes->copySolid(*d_haarNodes, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
ncvStat = h_haarFeatures->copySolid(*d_haarFeatures, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
////
NCVStatus calculateMemReqsAndAllocate(const Size& frameSize)
{
if (lastAllocatedFrameSize == frameSize)
return NCV_SUCCESS;
// Calculate memory requirements and create real allocators
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
NCVMemStackAllocator cpuCounter(devProp.textureAlignment);
ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", NCV_CUDA_ERROR);
NCVMatrixAlloc<Ncv8u> d_src(gpuCounter, frameSize.width, frameSize.height);
NCVMatrixAlloc<Ncv8u> h_src(cpuCounter, frameSize.width, frameSize.height);
ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<NcvRect32u> d_rects(gpuCounter, 100);
ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
Ncv32u numDetections;
ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages,
*d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
gpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);
cpuAllocator = new NCVMemStackAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);
ncvAssertPrintReturn(gpuAllocator->isInitialized(), "Error creating GPU memory allocator", NCV_CUDA_ERROR);
ncvAssertPrintReturn(cpuAllocator->isInitialized(), "Error creating CPU memory allocator", NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
////
cudaDeviceProp devProp;
NCVStatus ncvStat;
Ptr<NCVMemNativeAllocator> gpuCascadeAllocator;
Ptr<NCVMemNativeAllocator> cpuCascadeAllocator;
Ptr<NCVVectorAlloc<HaarStage64> > h_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > h_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > h_haarFeatures;
HaarClassifierCascadeDescriptor haar;
Ptr<NCVVectorAlloc<HaarStage64> > d_haarStages;
Ptr<NCVVectorAlloc<HaarClassifierNode128> > d_haarNodes;
Ptr<NCVVectorAlloc<HaarFeature64> > d_haarFeatures;
Size lastAllocatedFrameSize;
Ptr<NCVMemStackAllocator> gpuAllocator;
Ptr<NCVMemStackAllocator> cpuAllocator;
};
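calculateMemReqsAndAllocate above uses NCV's two-pass allocation scheme: the pipeline first runs against counting allocators that only record peak usage, and real stack allocators of exactly that capacity are then created for the actual run. Distilled into a sketch (runPipeline is a hypothetical stand-in for the ncvDetectObjectsMultiScale_device call):

NCVMemStackAllocator gpuCounter(devProp.textureAlignment);   // pass 1: records peak size, allocates nothing
runPipeline(gpuCounter);                                     // hypothetical pipeline call

NCVMemStackAllocator gpuAlloc(NCVMemoryTypeDevice,
                              gpuCounter.maxSize(),          // capacity measured in pass 1
                              devProp.textureAlignment);
runPipeline(gpuAlloc);                                       // pass 2: real device allocations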
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() : findLargestObject(false), visualizeInPlace(false), impl(0) {}
cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU(const string& filename) : findLargestObject(false), visualizeInPlace(false), impl(0) { load(filename); }
cv::gpu::CascadeClassifier_GPU::~CascadeClassifier_GPU() { release(); }
bool cv::gpu::CascadeClassifier_GPU::empty() const { return impl == 0; }
void cv::gpu::CascadeClassifier_GPU::release() { if (impl) { delete impl; impl = 0; } }
bool cv::gpu::CascadeClassifier_GPU::load(const string& filename)
{
release();
impl = new CascadeClassifierImpl(filename);
return !this->empty();
}
Size cv::gpu::CascadeClassifier_GPU::getClassifierSize() const
{
return this->empty() ? Size() : impl->getClassifierCvSize();
}
int cv::gpu::CascadeClassifier_GPU::detectMultiScale( const GpuMat& image, GpuMat& objectsBuf, double scaleFactor, int minNeighbors, Size minSize)
{
CV_Assert( scaleFactor > 1 && image.depth() == CV_8U);
CV_Assert( !this->empty());
const int defaultObjSearchNum = 100;
if (objectsBuf.empty())
objectsBuf.create(1, defaultObjSearchNum, DataType<Rect>::type);
NcvSize32u ncvMinSize = impl->getClassifierSize();
if (ncvMinSize.width < (unsigned)minSize.width && ncvMinSize.height < (unsigned)minSize.height)
{
ncvMinSize.width = minSize.width;
ncvMinSize.height = minSize.height;
}
unsigned int numDetections;
NCVStatus ncvStat = impl->process(image, objectsBuf, (float)scaleFactor, minNeighbors, findLargestObject, visualizeInPlace, ncvMinSize, numDetections);
if (ncvStat != NCV_SUCCESS)
CV_Error(CV_GpuApiCallError, "Error in face detectioln");
return numDetections;
}
struct RectConvert
{
Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); }
NcvRect32u operator()(const Rect& nr) const
{
NcvRect32u rect;
rect.x = nr.x;
rect.y = nr.y;
rect.width = nr.width;
rect.height = nr.height;
return rect;
}
};
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)
{
vector<Rect> rects(hypotheses.size());
std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());
if (weights)
{
vector<int> weights_int;
weights_int.assign(weights->begin(), weights->end());
cv::groupRectangles(rects, weights_int, groupThreshold, eps);
}
else
{
cv::groupRectangles(rects, groupThreshold, eps);
}
std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert());
hypotheses.resize(rects.size());
}
#if 1 /* loadFromXML implementation switch */
NCVStatus loadFromXML(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,
std::vector<HaarClassifierNode128> &haarClassifierNodes,
std::vector<HaarFeature64> &haarFeatures)
{
NCVStatus ncvStat;
haar.NumStages = 0;
haar.NumClassifierRootNodes = 0;
haar.NumClassifierTotalNodes = 0;
haar.NumFeatures = 0;
haar.ClassifierSize.width = 0;
haar.ClassifierSize.height = 0;
haar.bHasStumpsOnly = true;
haar.bNeedsTiltedII = false;
Ncv32u curMaxTreeDepth;
std::vector<char> xmlFileCont;
std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
haarStages.resize(0);
haarClassifierNodes.resize(0);
haarFeatures.resize(0);
Ptr<CvHaarClassifierCascade> oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);
if (oldCascade.empty())
return NCV_HAAR_XML_LOADING_EXCEPTION;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
haar.ClassifierSize.width = oldCascade->orig_window_size.width;
haar.ClassifierSize.height = oldCascade->orig_window_size.height;
int stagesCount = oldCascade->count;
for(int s = 0; s < stagesCount; ++s) // by stages
{
HaarStage64 curStage;
curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size());
curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);
int treesCount = oldCascade->stage_classifier[s].count;
for(int t = 0; t < treesCount; ++t) // by trees
{
Ncv32u nodeId = 0;
CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];
int nodesCount = tree->count;
for(int n = 0; n < nodesCount; ++n) //by features
{
CvHaarFeature* feature = &tree->haar_feature[n];
HaarClassifierNode128 curNode;
curNode.setThreshold(tree->threshold[n]);
HaarClassifierNodeDescriptor32 nodeLeft;
if ( tree->left[n] <= 0 )
{
Ncv32f leftVal = tree->alpha[-tree->left[n]];
ncvStat = nodeLeft.create(leftVal);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
}
else
{
Ncv32u leftNodeOffset = tree->left[n];
nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
haar.bHasStumpsOnly = false;
}
curNode.setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight;
if ( tree->right[n] <= 0 )
{
Ncv32f rightVal = tree->alpha[-tree->right[n]];
ncvStat = nodeRight.create(rightVal);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
}
else
{
Ncv32u rightNodeOffset = tree->right[n];
nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
haar.bHasStumpsOnly = false;
}
curNode.setRightNodeDesc(nodeRight);
Ncv32u tiltedVal = feature->tilted;
haar.bNeedsTiltedII = (tiltedVal != 0);
Ncv32u featureId = 0;
for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
{
Ncv32u rectX = feature->rect[l].r.x;
Ncv32u rectY = feature->rect[l].r.y;
Ncv32u rectWidth = feature->rect[l].r.width;
Ncv32u rectHeight = feature->rect[l].r.height;
Ncv32f rectWeight = feature->rect[l].weight;
if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
break;
HaarFeature64 curFeature;
ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
curFeature.setWeight(rectWeight);
ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
haarFeatures.push_back(curFeature);
featureId++;
}
HaarFeatureDescriptor32 tmpFeatureDesc;
ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId);
ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
curNode.setFeatureDesc(tmpFeatureDesc);
if (!nodeId)
{
//root node
haarClassifierNodes.push_back(curNode);
curMaxTreeDepth = 1;
}
else
{
//other node
h_TmpClassifierNotRootNodes.push_back(curNode);
curMaxTreeDepth++;
}
nodeId++;
}
}
curStage.setNumClassifierRootNodes(treesCount);
haarStages.push_back(curStage);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//fill in cascade stats
haar.NumStages = haarStages.size();
haar.NumClassifierRootNodes = haarClassifierNodes.size();
haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size();
haar.NumFeatures = haarFeatures.size();
//merge root and leaf nodes in one classifiers array
Ncv32u offsetRoot = haarClassifierNodes.size();
for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
{
HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
if (!nodeLeft.isLeaf())
{
Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
nodeLeft.create(newOffset);
}
haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
if (!nodeRight.isLeaf())
{
Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
nodeRight.create(newOffset);
}
haarClassifierNodes[i].setRightNodeDesc(nodeRight);
}
for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
{
HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
if (!nodeLeft.isLeaf())
{
Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
nodeLeft.create(newOffset);
}
h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
if (!nodeRight.isLeaf())
{
Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
nodeRight.create(newOffset);
}
h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);
haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
}
return NCV_SUCCESS;
}
////
#else /* loadFromXML implementation switch */
#include "e:/devNPP-OpenCV/src/external/_rapidxml-1.13/rapidxml.hpp"
NCVStatus loadFromXML(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,
std::vector<HaarClassifierNode128> &haarClassifierNodes,
std::vector<HaarFeature64> &haarFeatures)
{
NCVStatus ncvStat;
haar.NumStages = 0;
haar.NumClassifierRootNodes = 0;
haar.NumClassifierTotalNodes = 0;
haar.NumFeatures = 0;
haar.ClassifierSize.width = 0;
haar.ClassifierSize.height = 0;
haar.bNeedsTiltedII = false;
haar.bHasStumpsOnly = false;
FILE *fp;
fopen_s(&fp, filename.c_str(), "r");
ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
//get file size
fseek(fp, 0, SEEK_END);
Ncv32u xmlSize = ftell(fp);
fseek(fp, 0, SEEK_SET);
//load file to vector
std::vector<char> xmlFileCont;
xmlFileCont.resize(xmlSize+1);
memset(&xmlFileCont[0], 0, xmlSize+1);
fread_s(&xmlFileCont[0], xmlSize, 1, xmlSize, fp);
fclose(fp);
haar.bHasStumpsOnly = true;
haar.bNeedsTiltedII = false;
Ncv32u curMaxTreeDepth;
std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
haarStages.resize(0);
haarClassifierNodes.resize(0);
haarFeatures.resize(0);
//XML loading and OpenCV XML classifier syntax verification
try
{
rapidxml::xml_document<> doc;
doc.parse<0>(&xmlFileCont[0]);
//opencv_storage
rapidxml::xml_node<> *parserGlobal = doc.first_node();
ncvAssertReturn(!strcmp(parserGlobal->name(), "opencv_storage"), NCV_HAAR_XML_LOADING_EXCEPTION);
//classifier type
parserGlobal = parserGlobal->first_node();
ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);
rapidxml::xml_attribute<> *attr = parserGlobal->first_attribute("type_id");
ncvAssertReturn(!strcmp(attr->value(), "opencv-haar-classifier"), NCV_HAAR_XML_LOADING_EXCEPTION);
//classifier size
parserGlobal = parserGlobal->first_node("size");
ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);
sscanf_s(parserGlobal->value(), "%d %d", &(haar.ClassifierSize.width), &(haar.ClassifierSize.height));
//parse stages
parserGlobal = parserGlobal->next_sibling("stages");
ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);
parserGlobal = parserGlobal->first_node("_");
ncvAssertReturn(parserGlobal, NCV_HAAR_XML_LOADING_EXCEPTION);
while (parserGlobal)
{
HaarStage64 curStage;
curStage.setStartClassifierRootNodeOffset(haarClassifierNodes.size());
Ncv32u tmpNumClassifierRootNodes = 0;
rapidxml::xml_node<> *parserStageThreshold = parserGlobal->first_node("stage_threshold");
ncvAssertReturn(parserStageThreshold, NCV_HAAR_XML_LOADING_EXCEPTION);
Ncv32f tmpStageThreshold;
sscanf_s(parserStageThreshold->value(), "%f", &tmpStageThreshold);
curStage.setStageThreshold(tmpStageThreshold);
//parse trees
rapidxml::xml_node<> *parserTree;
parserTree = parserGlobal->first_node("trees");
ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION);
parserTree = parserTree->first_node("_");
ncvAssertReturn(parserTree, NCV_HAAR_XML_LOADING_EXCEPTION);
while (parserTree)
{
rapidxml::xml_node<> *parserNode;
parserNode = parserTree->first_node("_");
ncvAssertReturn(parserNode, NCV_HAAR_XML_LOADING_EXCEPTION);
Ncv32u nodeId = 0;
while (parserNode)
{
HaarClassifierNode128 curNode;
rapidxml::xml_node<> *parserNodeThreshold = parserNode->first_node("threshold");
ncvAssertReturn(parserNodeThreshold, NCV_HAAR_XML_LOADING_EXCEPTION);
Ncv32f tmpThreshold;
sscanf_s(parserNodeThreshold->value(), "%f", &tmpThreshold);
curNode.setThreshold(tmpThreshold);
rapidxml::xml_node<> *parserNodeLeft = parserNode->first_node("left_val");
HaarClassifierNodeDescriptor32 nodeLeft;
if (parserNodeLeft)
{
Ncv32f leftVal;
sscanf_s(parserNodeLeft->value(), "%f", &leftVal);
ncvStat = nodeLeft.create(leftVal);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
}
else
{
parserNodeLeft = parserNode->first_node("left_node");
ncvAssertReturn(parserNodeLeft, NCV_HAAR_XML_LOADING_EXCEPTION);
Ncv32u leftNodeOffset;
sscanf_s(parserNodeLeft->value(), "%d", &leftNodeOffset);
nodeLeft.create(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1);
haar.bHasStumpsOnly = false;
}
curNode.setLeftNodeDesc(nodeLeft);
rapidxml::xml_node<> *parserNodeRight = parserNode->first_node("right_val");
HaarClassifierNodeDescriptor32 nodeRight;
if (parserNodeRight)
{
Ncv32f rightVal;
sscanf_s(parserNodeRight->value(), "%f", &rightVal);
ncvStat = nodeRight.create(rightVal);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
}
else
{
parserNodeRight = parserNode->first_node("right_node");
ncvAssertReturn(parserNodeRight, NCV_HAAR_XML_LOADING_EXCEPTION);
Ncv32u rightNodeOffset;
sscanf_s(parserNodeRight->value(), "%d", &rightNodeOffset);
nodeRight.create(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1);
haar.bHasStumpsOnly = false;
}
curNode.setRightNodeDesc(nodeRight);
rapidxml::xml_node<> *parserNodeFeatures = parserNode->first_node("feature");
ncvAssertReturn(parserNodeFeatures, NCV_HAAR_XML_LOADING_EXCEPTION);
rapidxml::xml_node<> *parserNodeFeaturesTilted = parserNodeFeatures->first_node("tilted");
ncvAssertReturn(parserNodeFeaturesTilted, NCV_HAAR_XML_LOADING_EXCEPTION);
Ncv32u tiltedVal;
sscanf_s(parserNodeFeaturesTilted->value(), "%d", &tiltedVal);
haar.bNeedsTiltedII = (tiltedVal != 0);
rapidxml::xml_node<> *parserNodeFeaturesRects = parserNodeFeatures->first_node("rects");
ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION);
parserNodeFeaturesRects = parserNodeFeaturesRects->first_node("_");
ncvAssertReturn(parserNodeFeaturesRects, NCV_HAAR_XML_LOADING_EXCEPTION);
Ncv32u featureId = 0;
while (parserNodeFeaturesRects)
{
Ncv32u rectX, rectY, rectWidth, rectHeight;
Ncv32f rectWeight;
sscanf_s(parserNodeFeaturesRects->value(), "%d %d %d %d %f", &rectX, &rectY, &rectWidth, &rectHeight, &rectWeight);
HaarFeature64 curFeature;
ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
curFeature.setWeight(rectWeight);
ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
haarFeatures.push_back(curFeature);
parserNodeFeaturesRects = parserNodeFeaturesRects->next_sibling("_");
featureId++;
}
HaarFeatureDescriptor32 tmpFeatureDesc;
ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, featureId, haarFeatures.size() - featureId);
ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
curNode.setFeatureDesc(tmpFeatureDesc);
if (!nodeId)
{
//root node
haarClassifierNodes.push_back(curNode);
curMaxTreeDepth = 1;
}
else
{
//other node
h_TmpClassifierNotRootNodes.push_back(curNode);
curMaxTreeDepth++;
}
parserNode = parserNode->next_sibling("_");
nodeId++;
}
parserTree = parserTree->next_sibling("_");
tmpNumClassifierRootNodes++;
}
curStage.setNumClassifierRootNodes(tmpNumClassifierRootNodes);
haarStages.push_back(curStage);
parserGlobal = parserGlobal->next_sibling("_");
}
}
catch (...)
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
//fill in cascade stats
haar.NumStages = haarStages.size();
haar.NumClassifierRootNodes = haarClassifierNodes.size();
haar.NumClassifierTotalNodes = haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size();
haar.NumFeatures = haarFeatures.size();
//merge root and leaf nodes in one classifiers array
Ncv32u offsetRoot = haarClassifierNodes.size();
for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
{
HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
if (!nodeLeft.isLeaf())
{
Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
nodeLeft.create(newOffset);
}
haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
if (!nodeRight.isLeaf())
{
Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
nodeRight.create(newOffset);
}
haarClassifierNodes[i].setRightNodeDesc(nodeRight);
}
for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
{
HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
if (!nodeLeft.isLeaf())
{
Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
nodeLeft.create(newOffset);
}
h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
if (!nodeRight.isLeaf())
{
Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
nodeRight.create(newOffset);
}
h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);
haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
}
return NCV_SUCCESS;
}
#endif /* loadFromXML implementation switch */
#endif /* HAVE_CUDA */


@@ -62,7 +62,7 @@ namespace cv
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU
};
// Converts CPU border extrapolation mode into GPU internal analogue.
// Returns true if the GPU analogue exists, false otherwise.
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
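A hedged usage sketch of the helper declared above (CV_StsBadArg is just one reasonable error code to raise):

int gpuBorderType = 0;
if (!tryConvertToGpuBorderType(cv::BORDER_REPLICATE, gpuBorderType))
    CV_Error(CV_StsBadArg, "Unsupported border extrapolation mode");
// gpuBorderType now holds the internal analogue, e.g. BORDER_REPLICATE_GPU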
@@ -105,8 +105,28 @@ namespace cv
const textureReference* tex;
cudaSafeCall( cudaGetTextureReference(&tex, name) );
cudaSafeCall( cudaUnbindTexture(tex) );
}
}
struct KeyPoint_GPU
{
float x;
float y;
float size;
float response;
float angle;
float octave;
};
enum KeypointLayout
{
SF_X,
SF_Y,
SF_SIZE,
SF_RESPONSE,
SF_ANGLE,
SF_OCTAVE,
SF_FEATURE_STRIDE
};
}
}


@@ -47,29 +47,7 @@ namespace cv
{
namespace gpu
{
namespace surf
{
struct KeyPoint_GPU
{
float x;
float y;
float size;
float response;
float angle;
float octave;
};
enum KeypointLayout
{
SF_X,
SF_Y,
SF_SIZE,
SF_RESPONSE,
SF_ANGLE,
SF_OCTAVE,
SF_FEATURE_STRIDE
};
}
}
}


@@ -82,21 +82,16 @@ void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu()
namespace
{
typedef NppStatus (*npp_arithm_8u_t)(const Npp8u* pSrc1, int nSrc1Step, const Npp8u* pSrc2, int nSrc2Step, Npp8u* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
typedef NppStatus (*npp_arithm_32s_t)(const Npp32s* pSrc1, int nSrc1Step, const Npp32s* pSrc2, int nSrc2Step, Npp32s* pDst, int nDstStep, NppiSize oSizeROI);
typedef NppStatus (*npp_arithm_32f_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
void nppArithmCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst,
npp_arithm_8u_t npp_func_8uc1, npp_arithm_8u_t npp_func_8uc4,
npp_arithm_32s_t npp_func_32sc1, npp_arithm_32f_t npp_func_32fc1)
{
CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert(src1.type() == CV_8UC1 || src1.type() == CV_8UC4 || src1.type() == CV_32SC1 || src1.type() == CV_32FC1);
dst.create( src1.size(), src1.type() );
NppiSize sz;
@@ -106,24 +101,16 @@ namespace
switch (src1.type())
{
case CV_8UC1:
nppSafeCall( npp_func_8uc1(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );
break;
case CV_8UC4:
nppSafeCall( npp_func_8uc4(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz, 0) );
break;
case CV_32SC1:
nppSafeCall( npp_func_32sc1(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );
break;
case CV_32FC1:
nppSafeCall( npp_func_32fc1(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );
break;
default:
CV_Assert(!"Unsupported source type");
@@ -133,16 +120,15 @@ namespace
template<int SCN> struct NppArithmScalarFunc;
template<> struct NppArithmScalarFunc<1>
{
typedef NppStatus (*func_ptr)(const Npp32f *pSrc, int nSrcStep, Npp32f nValue, Npp32f *pDst, int nDstStep, NppiSize oSizeROI);
};
template<> struct NppArithmScalarFunc<2>
{
typedef NppStatus (*func_ptr)(const Npp32fc *pSrc, int nSrcStep, Npp32fc nValue, Npp32fc *pDst, int nDstStep, NppiSize oSizeROI);
};
template<int SCN, typename NppArithmScalarFunc<SCN>::func_ptr func> struct NppArithmScalar;
template<typename NppArithmScalarFunc<1>::func_ptr func> struct NppArithmScalar<1, func>
{
static void calc(const GpuMat& src, const Scalar& sc, GpuMat& dst)
@@ -254,24 +240,16 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
switch (src1.type())
{
case CV_8UC1:
nppSafeCall( nppiAbsDiff_8u_C1R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );
break;
case CV_8UC4:
nppSafeCall( nppiAbsDiff_8u_C4R(src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, dst.ptr<Npp8u>(), dst.step, sz) );
break;
case CV_32SC1:
nppSafeCall( nppiAbsDiff_32s_C1R(src1.ptr<Npp32s>(), src1.step, src2.ptr<Npp32s>(), src2.step, dst.ptr<Npp32s>(), dst.step, sz) );
break;
case CV_32FC1:
nppSafeCall( nppiAbsDiff_32f_C1R(src1.ptr<Npp32f>(), src1.step, src2.ptr<Npp32f>(), src2.step, dst.ptr<Npp32f>(), dst.step, sz) );
break;
default:
CV_Assert(!"Unsupported source type");


@@ -0,0 +1,362 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <cstdio>
#include <cuda_runtime.h>
#define CV_NO_BACKWARD_COMPATIBILITY
#include "opencv2/opencv.hpp"
#include "NCVHaarObjectDetection.hpp"
using namespace cv;
using namespace std;
const Size preferredVideoFrameSize(640, 480);
string preferredClassifier = "haarcascade_frontalface_alt.xml";
string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed";
void printSyntax(void)
{
printf("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n");
}
void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...)
{
int fontFace = CV_FONT_HERSHEY_PLAIN;
double fontScale = 1;
int baseline;
Size textSize = cv::getTextSize("T", fontFace, fontScale, 1, &baseline);
va_list arg_ptr;
va_start(arg_ptr, format);
int len = _vscprintf(format, arg_ptr) + 1;
vector<char> strBuf(len);
vsprintf_s(&strBuf[0], len, format, arg_ptr);
Point org(1, 3 * textSize.height * (lineOffsY + 1) / 2);
putText(img, &strBuf[0], org, fontFace, fontScale, color);
va_end(arg_ptr);
}
NCVStatus process(Mat *srcdst,
Ncv32u width, Ncv32u height,
NcvBool bShowAllHypotheses, NcvBool bLargestFace,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &d_haarStages, NCVVector<HaarClassifierNode128> &d_haarNodes,
NCVVector<HaarFeature64> &d_haarFeatures, NCVVector<HaarStage64> &h_haarStages,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp)
{
ncvAssertReturn(!((srcdst == NULL) ^ gpuAllocator.isCounting()), NCV_NULL_PTR);
NCVStatus ncvStat;
NCV_SET_SKIP_COND(gpuAllocator.isCounting());
NCVMatrixAlloc<Ncv8u> d_src(gpuAllocator, width, height);
ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv8u> h_src(cpuAllocator, width, height);
ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<NcvRect32u> d_rects(gpuAllocator, 100);
ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride());
NCV_SKIP_COND_BEGIN
(*srcdst).copyTo(h_src_hdr);
ncvStat = h_src.copySolid(d_src, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
NCV_SKIP_COND_END
NcvSize32u roi;
roi.width = d_src.width();
roi.height = d_src.height();
Ncv32u numDetections;
ncvStat = ncvDetectObjectsMultiScale_device(
d_src, roi, d_rects, numDetections, haar, h_haarStages,
d_haarStages, d_haarNodes, d_haarFeatures,
haar.ClassifierSize,
bShowAllHypotheses ? 0 : 4,
1.2f, 1,
(bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace,
gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
NCV_SKIP_COND_BEGIN
ncvStat = d_src.copySolid(h_src, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
h_src_hdr.copyTo(*srcdst);
NCV_SKIP_COND_END
return NCV_SUCCESS;
}
int main( int argc, const char** argv )
{
NCVStatus ncvStat;
printf("NVIDIA Computer Vision SDK\n");
printf("Face Detection in video and live feed\n");
printf("=========================================\n");
printf(" Esc - Quit\n");
printf(" Space - Switch between NCV and OpenCV\n");
printf(" L - Switch between FullSearch and LargestFace modes\n");
printf(" U - Toggle unfiltered hypotheses visualization in FullSearch\n");
if (argc != 4 && argc != 1)
return printSyntax(), -1;
VideoCapture capture;
Size frameSize;
if (argc == 1 || strcmp(argv[1], "-c") == 0)
{
// Camera input is specified
int camIdx = (argc == 3) ? atoi(argv[2]) : 0;
if(!capture.open(camIdx))
return printf("Error opening camera\n"), -1;
capture.set(CV_CAP_PROP_FRAME_WIDTH, preferredVideoFrameSize.width);
capture.set(CV_CAP_PROP_FRAME_HEIGHT, preferredVideoFrameSize.height);
capture.set(CV_CAP_PROP_FPS, 25);
frameSize = preferredVideoFrameSize;
}
else if (strcmp(argv[1], "-v") == 0)
{
// Video file input (avi)
if(!capture.open(argv[2]))
return printf("Error opening video file\n"), -1;
frameSize.width = (int)capture.get(CV_CAP_PROP_FRAME_WIDTH);
frameSize.height = (int)capture.get(CV_CAP_PROP_FRAME_HEIGHT);
}
else
return printSyntax(), -1;
NcvBool bUseOpenCV = true;
NcvBool bLargestFace = true;
NcvBool bShowAllHypotheses = false;
string classifierFile = (argc == 1) ? preferredClassifier : argv[3];
CascadeClassifier classifierOpenCV;
if (!classifierOpenCV.load(classifierFile))
return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1;
int devId;
ncvAssertCUDAReturn(cudaGetDevice(&devId), -1);
cudaDeviceProp devProp;
ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), -1);
printf("Using GPU %d %s, arch=%d.%d\n", devId, devProp.name, devProp.major, devProp.minor);
//==============================================================================
//
// Load the classifier from file (assuming its size is about 1 mb)
// using a simple allocator
//
//==============================================================================
NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice);
ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1);
NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned);
ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1);
Ncv32u haarNumStages, haarNumNodes, haarNumFeatures;
ncvStat = ncvHaarGetClassifierSize(classifierFile, haarNumStages, haarNumNodes, haarNumFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error reading classifier size (check the file)", -1);
NCVVectorAlloc<HaarStage64> h_haarStages(cpuCascadeAllocator, haarNumStages);
ncvAssertPrintReturn(h_haarStages.isMemAllocated(), "Error in cascade CPU allocator", -1);
NCVVectorAlloc<HaarClassifierNode128> h_haarNodes(cpuCascadeAllocator, haarNumNodes);
ncvAssertPrintReturn(h_haarNodes.isMemAllocated(), "Error in cascade CPU allocator", -1);
NCVVectorAlloc<HaarFeature64> h_haarFeatures(cpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(h_haarFeatures.isMemAllocated(), "Error in cascade CPU allocator", -1);
HaarClassifierCascadeDescriptor haar;
ncvStat = ncvHaarLoadFromFile_host(classifierFile, haar, h_haarStages, h_haarNodes, h_haarFeatures);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error loading classifier", -1);
NCVVectorAlloc<HaarStage64> d_haarStages(gpuCascadeAllocator, haarNumStages);
ncvAssertPrintReturn(d_haarStages.isMemAllocated(), "Error in cascade GPU allocator", -1);
NCVVectorAlloc<HaarClassifierNode128> d_haarNodes(gpuCascadeAllocator, haarNumNodes);
ncvAssertPrintReturn(d_haarNodes.isMemAllocated(), "Error in cascade GPU allocator", -1);
NCVVectorAlloc<HaarFeature64> d_haarFeatures(gpuCascadeAllocator, haarNumFeatures);
ncvAssertPrintReturn(d_haarFeatures.isMemAllocated(), "Error in cascade GPU allocator", -1);
ncvStat = h_haarStages.copySolid(d_haarStages, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
ncvStat = h_haarNodes.copySolid(d_haarNodes, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
ncvStat = h_haarFeatures.copySolid(d_haarFeatures, 0);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error copying cascade to GPU", -1);
//==============================================================================
//
// Calculate memory requirements and create real allocators
//
//==============================================================================
NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
ncvAssertPrintReturn(gpuCounter.isInitialized(), "Error creating GPU memory counter", -1);
NCVMemStackAllocator cpuCounter(devProp.textureAlignment);
ncvAssertPrintReturn(cpuCounter.isInitialized(), "Error creating CPU memory counter", -1);
ncvStat = process(NULL, frameSize.width, frameSize.height,
false, false, haar,
d_haarStages, d_haarNodes,
d_haarFeatures, h_haarStages,
gpuCounter, cpuCounter, devProp);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, gpuCounter.maxSize(), devProp.textureAlignment);
ncvAssertPrintReturn(gpuAllocator.isInitialized(), "Error creating GPU memory allocator", -1);
NCVMemStackAllocator cpuAllocator(NCVMemoryTypeHostPinned, cpuCounter.maxSize(), devProp.textureAlignment);
ncvAssertPrintReturn(cpuAllocator.isInitialized(), "Error creating CPU memory allocator", -1);
printf("Initialized for frame size [%dx%d]\n", frameSize.width, frameSize.height);
//==============================================================================
//
// Main processing loop
//
//==============================================================================
namedWindow(wndTitle, 1);
Mat frame, gray, frameDisp;
for(;;)
{
// For camera and video file, capture the next image
capture >> frame;
if (frame.empty())
break;
cvtColor(frame, gray, CV_BGR2GRAY);
// process
NcvSize32u minSize = haar.ClassifierSize;
if (bLargestFace)
{
Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width;
Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height;
Ncv32u ratioSmallest = std::min(ratioX, ratioY);
ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f);
minSize.width *= ratioSmallest;
minSize.height *= ratioSmallest;
}
NcvTimer timer = ncvStartTimer();
if (!bUseOpenCV)
{
ncvStat = process(&gray, frameSize.width, frameSize.height,
bShowAllHypotheses, bLargestFace, haar,
d_haarStages, d_haarNodes,
d_haarFeatures, h_haarStages,
gpuAllocator, cpuAllocator, devProp);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "Error in memory counting pass", -1);
}
else
{
vector<Rect> rectsOpenCV;
classifierOpenCV.detectMultiScale(
gray,
rectsOpenCV,
1.2f,
bShowAllHypotheses && !bLargestFace ? 0 : 4,
(bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE,
Size(minSize.width, minSize.height));
for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt)
rectangle(gray, rectsOpenCV[rt], Scalar(255));
}
Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer);
cvtColor(gray, frameDisp, CV_GRAY2BGR);
imagePrintf(frameDisp, 0, CV_RGB(255, 0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":"");
imagePrintf(frameDisp, 1, CV_RGB(255, 0,0), "L - Switch FullSearch%s / LargestFace%s modes", bLargestFace?"":" (ON)", bLargestFace?" (ON)":"");
imagePrintf(frameDisp, 2, CV_RGB(255, 0,0), "U - Toggle unfiltered hypotheses visualization in FullSearch %s", bShowAllHypotheses?"(ON)":"(OFF)");
imagePrintf(frameDisp, 3, CV_RGB(118,185,0), " Running at %f FPS on %s", 1000.0f / avgTime, bUseOpenCV?"CPU":"GPU");
cv::imshow(wndTitle, frameDisp);
switch (cvWaitKey(1))
{
case ' ':
bUseOpenCV = !bUseOpenCV;
break;
case 'L':case 'l':
bLargestFace = !bLargestFace;
break;
case 'U':case 'u':
bShowAllHypotheses = !bShowAllHypotheses;
break;
case 27:
return 0;
}
}
return 0;
}


@@ -0,0 +1,571 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <precomp.hpp>
#if !defined (HAVE_CUDA)
#else /* !defined (HAVE_CUDA) */
#include <stdarg.h>
#include "NCV.hpp"
//==============================================================================
//
// Error handling helpers
//
//==============================================================================
static void stdioDebugOutput(const char *msg)
{
printf("%s", msg);
}
static NCVDebugOutputHandler *debugOutputHandler = stdioDebugOutput;
void ncvDebugOutput(const char *msg, ...)
{
const int K_DEBUG_STRING_MAXLEN = 1024;
char buffer[K_DEBUG_STRING_MAXLEN];
va_list args;
va_start(args, msg);
vsnprintf_s(buffer, K_DEBUG_STRING_MAXLEN, K_DEBUG_STRING_MAXLEN-1, msg, args);
va_end (args);
debugOutputHandler(buffer);
}
void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)
{
debugOutputHandler = func;
}
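A sketch of plugging in a custom handler (myNcvHandler is a hypothetical sink; the signature matches the NCVDebugOutputHandler registered by cascadeclassifier.cpp above):

#include <cstdio>

static void myNcvHandler(const char *msg)
{
    fprintf(stderr, "[NCV] %s", msg);    // route NCV debug text to stderr
}

// during initialization:
ncvSetDebugOutputHandler(myNcvHandler);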
//==============================================================================
//
// Memory wrappers and helpers
//
//==============================================================================
NCVStatus GPUAlignmentValue(Ncv32u &alignment)
{
int curDev;
cudaDeviceProp curProp;
ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR);
alignment = static_cast<Ncv32u>(curProp.textureAlignment);
return NCV_SUCCESS;
}
Ncv32u alignUp(Ncv32u what, Ncv32u alignment)
{
Ncv32u alignMask = alignment-1;
Ncv32u inverseAlignMask = ~alignMask;
Ncv32u res = (what + alignMask) & inverseAlignMask;
return res;
}
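//Worked example (editor's note): for power-of-2 alignments the mask arithmetic
//rounds up to the next multiple, e.g. alignUp(13, 8): alignMask = 7,
//(13 + 7) & ~7 == 16; already-aligned values pass through: alignUp(16, 8) == 16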
void NCVMemPtr::clear()
{
ptr = NULL;
memtype = NCVMemoryTypeNone;
}
void NCVMemSegment::clear()
{
begin.clear();
size = 0;
}
NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NCVMemoryType srcType, size_t sz, cudaStream_t cuStream)
{
NCVStatus ncvStat;
switch (dstType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
memcpy(dst, src, sz);
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
case NCVMemoryTypeDevice:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
return ncvStat;
}
//===================================================================
//
// NCVMemStackAllocator class members implementation
//
//===================================================================
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
:
currentSize(0),
_maxSize(0),
allocBegin(NULL),
begin(NULL),
_memType(NCVMemoryTypeNone),
_alignment(alignment)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
}
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment)
:
currentSize(0),
_maxSize(0),
allocBegin(NULL),
_memType(memT),
_alignment(alignment)
{
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
switch (memT)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPageable:
allocBegin = (Ncv8u *)malloc(capacity);
break;
}
if (capacity == 0)
{
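//dummy non-NULL pointer so that isInitialized() holds for zero-capacity allocators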
allocBegin = (Ncv8u *)(0x1);
}
if (!isCounting())
{
begin = allocBegin;
end = begin + capacity;
}
}
NCVMemStackAllocator::~NCVMemStackAllocator()
{
if (allocBegin != NULL)
{
ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");
switch (_memType)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaFree(allocBegin), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
break;
case NCVMemoryTypeHostPageable:
free(allocBegin);
break;
}
allocBegin = NULL;
}
}
NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size)
{
seg.clear();
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
size = alignUp(size, this->_alignment);
this->currentSize += size;
this->_maxSize = std::max(this->_maxSize, this->currentSize);
if (!isCounting())
{
size_t availSize = end - begin;
ncvAssertReturn(size <= availSize, NCV_ALLOCATOR_INSUFFICIENT_CAPACITY);
}
seg.begin.ptr = begin;
seg.begin.memtype = this->_memType;
seg.size = size;
begin += size;
return NCV_SUCCESS;
}
NCVStatus NCVMemStackAllocator::dealloc(NCVMemSegment &seg)
{
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(seg.begin.ptr != NULL || isCounting(), NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(seg.begin.ptr == begin - seg.size, NCV_ALLOCATOR_DEALLOC_ORDER);
currentSize -= seg.size;
begin -= seg.size;
seg.clear();
ncvAssertReturn(allocBegin <= begin, NCV_ALLOCATOR_BAD_DEALLOC);
return NCV_SUCCESS;
}
NcvBool NCVMemStackAllocator::isInitialized(void) const
{
return (((this->_alignment & (this->_alignment-1)) == 0) && isCounting()) || (this->allocBegin != NULL);
}
NcvBool NCVMemStackAllocator::isCounting(void) const
{
return this->_memType == NCVMemoryTypeNone;
}
NCVMemoryType NCVMemStackAllocator::memType(void) const
{
return this->_memType;
}
Ncv32u NCVMemStackAllocator::alignment(void) const
{
return this->_alignment;
}
size_t NCVMemStackAllocator::maxSize(void) const
{
return this->_maxSize;
}
//===================================================================
//
// NCVMemNativeAllocator class members implementation
//
//===================================================================
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT)
:
currentSize(0),
_maxSize(0),
_memType(memT)
{
ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", );
}
NCVMemNativeAllocator::~NCVMemNativeAllocator()
{
ncvAssertPrintCheck(currentSize == 0, "NCVMemNativeAllocator dtor:: detected memory leak");
}
NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size)
{
seg.clear();
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
switch (this->_memType)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaMalloc(&seg.begin.ptr, size), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaMallocHost(&seg.begin.ptr, size), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPageable:
seg.begin.ptr = (Ncv8u *)malloc(size);
break;
}
this->currentSize += alignUp(size, this->_alignment);
this->_maxSize = std::max(this->_maxSize, this->currentSize);
seg.begin.memtype = this->_memType;
seg.size = size;
return NCV_SUCCESS;
}
NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)
{
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(currentSize >= alignUp(seg.size, this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);
currentSize -= alignUp(seg.size, this->_alignment);
switch (this->_memType)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaFree(seg.begin.ptr), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaFreeHost(seg.begin.ptr), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPageable:
free(seg.begin.ptr);
break;
}
seg.clear();
return NCV_SUCCESS;
}
NcvBool NCVMemNativeAllocator::isInitialized(void) const
{
return (this->_alignment != 0);
}
NcvBool NCVMemNativeAllocator::isCounting(void) const
{
return false;
}
NCVMemoryType NCVMemNativeAllocator::memType(void) const
{
return this->_memType;
}
Ncv32u NCVMemNativeAllocator::alignment(void) const
{
return this->_alignment;
}
size_t NCVMemNativeAllocator::maxSize(void) const
{
return this->_maxSize;
}
//===================================================================
//
// Time and timer routines
//
//===================================================================
typedef struct _NcvTimeMoment NcvTimeMoment;
#if defined(_WIN32) || defined(_WIN64)
#include <Windows.h>
typedef struct _NcvTimeMoment
{
LONGLONG moment, freq;
} NcvTimeMoment;
static void _ncvQueryMoment(NcvTimeMoment *t)
{
QueryPerformanceFrequency((LARGE_INTEGER *)&(t->freq));
QueryPerformanceCounter((LARGE_INTEGER *)&(t->moment));
}
double _ncvMomentToMicroseconds(NcvTimeMoment *t)
{
return 1000000.0 * t->moment / t->freq;
}
double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
return 1000000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);
}
double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
return 1000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);
}
#elif defined(__unix__)
#include <sys/time.h>
typedef struct _NcvTimeMoment
{
struct timeval tv;
struct timezone tz;
} NcvTimeMoment;
void _ncvQueryMoment(NcvTimeMoment *t)
{
gettimeofday(& t->tv, & t->tz);
}
double _ncvMomentToMicroseconds(NcvTimeMoment *t)
{
return 1000000.0 * t->tv.tv_sec + (double)t->tv.tv_usec;
}
double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
return (((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000000 + (double)t2->tv.tv_usec - (double)t1->tv.tv_usec);
}
double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
//was missing from the __unix__ branch although ncvEndQueryTimerMs below requires it
return _ncvMomentsDiffToMicroseconds(t1, t2) / 1000.0;
}
#endif //#if defined(_WIN32) || defined(_WIN64)
struct _NcvTimer
{
NcvTimeMoment t1, t2;
};
NcvTimer ncvStartTimer(void)
{
struct _NcvTimer *t;
t = (struct _NcvTimer *)malloc(sizeof(struct _NcvTimer));
_ncvQueryMoment(&t->t1);
return t;
}
double ncvEndQueryTimerUs(NcvTimer t)
{
double res;
_ncvQueryMoment(&t->t2);
res = _ncvMomentsDiffToMicroseconds(&t->t1, &t->t2);
free(t);
return res;
}
double ncvEndQueryTimerMs(NcvTimer t)
{
double res;
_ncvQueryMoment(&t->t2);
res = _ncvMomentsDiffToMilliseconds(&t->t1, &t->t2);
free(t);
return res;
}
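//Usage sketch (editor's note): timers are heap-allocated and released by the
//end-query call, so each ncvStartTimer() pairs with exactly one end call:
//
//    NcvTimer t = ncvStartTimer();
//    //...timed work...
//    double ms = ncvEndQueryTimerMs(t); //also frees the timer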
#endif /* !defined (HAVE_CUDA) */

View File

@ -0,0 +1,837 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _ncv_hpp_
#define _ncv_hpp_
#include <cuda_runtime.h>
#include "npp_staging.h"
//==============================================================================
//
// Alignment macros
//
//==============================================================================
#if !defined(__align__) && !defined(__CUDACC__)
#if defined(_WIN32) || defined(_WIN64)
#define __align__(n) __declspec(align(n))
#elif defined(__unix__)
#define __align__(n) __attribute__((__aligned__(n)))
#endif
#endif
//==============================================================================
//
// Integral and compound types of guaranteed size
//
//==============================================================================
typedef bool NcvBool;
typedef long long Ncv64s;
typedef unsigned long long Ncv64u;
typedef int Ncv32s;
typedef unsigned int Ncv32u;
typedef short Ncv16s;
typedef unsigned short Ncv16u;
typedef signed char Ncv8s;
typedef unsigned char Ncv8u;
typedef float Ncv32f;
typedef double Ncv64f;
typedef struct
{
Ncv8u x;
Ncv8u y;
Ncv8u width;
Ncv8u height;
} NcvRect8u;
typedef struct
{
Ncv32s x; ///< x-coordinate of upper left corner.
Ncv32s y; ///< y-coordinate of upper left corner.
Ncv32s width; ///< Rectangle width.
Ncv32s height; ///< Rectangle height.
} NcvRect32s;
typedef struct
{
Ncv32u x; ///< x-coordinate of upper left corner.
Ncv32u y; ///< y-coordinate of upper left corner.
Ncv32u width; ///< Rectangle width.
Ncv32u height; ///< Rectangle height.
} NcvRect32u;
typedef struct
{
Ncv32s width; ///< Rectangle width.
Ncv32s height; ///< Rectangle height.
} NcvSize32s;
typedef struct
{
Ncv32u width; ///< Rectangle width.
Ncv32u height; ///< Rectangle height.
} NcvSize32u;
NPPST_CT_ASSERT(sizeof(NcvBool) <= 4);
NPPST_CT_ASSERT(sizeof(Ncv64s) == 8);
NPPST_CT_ASSERT(sizeof(Ncv64u) == 8);
NPPST_CT_ASSERT(sizeof(Ncv32s) == 4);
NPPST_CT_ASSERT(sizeof(Ncv32u) == 4);
NPPST_CT_ASSERT(sizeof(Ncv16s) == 2);
NPPST_CT_ASSERT(sizeof(Ncv16u) == 2);
NPPST_CT_ASSERT(sizeof(Ncv8s) == 1);
NPPST_CT_ASSERT(sizeof(Ncv8u) == 1);
NPPST_CT_ASSERT(sizeof(Ncv32f) == 4);
NPPST_CT_ASSERT(sizeof(Ncv64f) == 8);
NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u));
NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s));
NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u));
NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u));
//==============================================================================
//
// Persistent constants
//
//==============================================================================
const Ncv32u K_WARP_SIZE = 32;
const Ncv32u K_LOG2_WARP_SIZE = 5;
//==============================================================================
//
// Error handling
//
//==============================================================================
#define NCV_CT_PREP_STRINGIZE_AUX(x) #x
#define NCV_CT_PREP_STRINGIZE(x) NCV_CT_PREP_STRINGIZE_AUX(x)
void ncvDebugOutput(const char *msg, ...);
typedef void NCVDebugOutputHandler(const char* msg);
void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
#define ncvAssertPrintCheck(pred, msg) \
((pred) ? true : (ncvDebugOutput("\n%s\n", \
"NCV Assertion Failed: " msg ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__) \
), false))
#define ncvAssertPrintReturn(pred, msg, err) \
if (ncvAssertPrintCheck(pred, msg)) ; else return err
#define ncvAssertReturn(pred, err) \
do \
{ \
if (!(pred)) \
{ \
ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: retcode=", (int)err, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
return err; \
} \
} while (0)
#define ncvAssertReturnNcvStat(ncvOp) \
do \
{ \
NCVStatus _ncvStat = ncvOp; \
if (NCV_SUCCESS != _ncvStat) \
{ \
ncvDebugOutput("\n%s%d%s\n", "NCV Assertion Failed: NcvStat=", (int)_ncvStat, ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
return _ncvStat; \
} \
} while (0)
#define ncvAssertCUDAReturn(cudacall, errCode) \
do \
{ \
cudaError_t resCall = cudacall; \
cudaError_t resGLE = cudaGetLastError(); \
if (cudaSuccess != resCall || cudaSuccess != resGLE) \
{ \
ncvDebugOutput("\n%s%d%s\n", "NCV CUDA Assertion Failed: cudaError_t=", (int)(resCall | resGLE), ", file=" __FILE__ ", line=" NCV_CT_PREP_STRINGIZE(__LINE__)); \
return errCode; \
} \
} while (0)
/**
* Return-codes for status notification, errors and warnings
*/
enum NCVStatus
{
NCV_SUCCESS,
NCV_CUDA_ERROR,
NCV_NPP_ERROR,
NCV_FILE_ERROR,
NCV_NULL_PTR,
NCV_INCONSISTENT_INPUT,
NCV_TEXTURE_BIND_ERROR,
NCV_DIMENSIONS_INVALID,
NCV_INVALID_ROI,
NCV_INVALID_STEP,
NCV_INVALID_SCALE,
NCV_ALLOCATOR_NOT_INITIALIZED,
NCV_ALLOCATOR_BAD_ALLOC,
NCV_ALLOCATOR_BAD_DEALLOC,
NCV_ALLOCATOR_INSUFFICIENT_CAPACITY,
NCV_ALLOCATOR_DEALLOC_ORDER,
NCV_ALLOCATOR_BAD_REUSE,
NCV_MEM_COPY_ERROR,
NCV_MEM_RESIDENCE_ERROR,
NCV_MEM_INSUFFICIENT_CAPACITY,
NCV_HAAR_INVALID_PIXEL_STEP,
NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER,
NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE,
NCV_HAAR_TOO_LARGE_FEATURES,
NCV_HAAR_XML_LOADING_EXCEPTION,
NCV_NOIMPL_HAAR_TILTED_FEATURES,
NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW,
};
#define NCV_SET_SKIP_COND(x) \
bool __ncv_skip_cond = x
#define NCV_RESET_SKIP_COND(x) \
__ncv_skip_cond = x
#define NCV_SKIP_COND_BEGIN \
if (!__ncv_skip_cond) {
#define NCV_SKIP_COND_END \
}
//==============================================================================
//
// Timer
//
//==============================================================================
typedef struct _NcvTimer *NcvTimer;
NcvTimer ncvStartTimer(void);
double ncvEndQueryTimerUs(NcvTimer t);
double ncvEndQueryTimerMs(NcvTimer t);
//==============================================================================
//
// Memory management classes template compound types
//
//==============================================================================
/**
* Alignment of GPU memory chunks in bytes
*/
NCVStatus GPUAlignmentValue(Ncv32u &alignment);
/**
* Calculates the aligned top bound value
*/
Ncv32u alignUp(Ncv32u what, Ncv32u alignment);
/**
* NCVMemoryType
*/
enum NCVMemoryType
{
NCVMemoryTypeNone,
NCVMemoryTypeHostPageable,
NCVMemoryTypeHostPinned,
NCVMemoryTypeDevice
};
/**
* NCVMemPtr
*/
struct NCVMemPtr
{
void *ptr;
NCVMemoryType memtype;
void clear();
};
/**
* NCVMemSegment
*/
struct NCVMemSegment
{
NCVMemPtr begin;
size_t size;
void clear();
};
/**
* INCVMemAllocator (Interface)
*/
class INCVMemAllocator
{
public:
virtual ~INCVMemAllocator() = 0;
virtual NCVStatus alloc(NCVMemSegment &seg, size_t size) = 0;
virtual NCVStatus dealloc(NCVMemSegment &seg) = 0;
virtual NcvBool isInitialized(void) const = 0;
virtual NcvBool isCounting(void) const = 0;
virtual NCVMemoryType memType(void) const = 0;
virtual Ncv32u alignment(void) const = 0;
virtual size_t maxSize(void) const = 0;
};
inline INCVMemAllocator::~INCVMemAllocator() {}
/**
* NCVMemStackAllocator
*/
class NCVMemStackAllocator : public INCVMemAllocator
{
NCVMemStackAllocator();
NCVMemStackAllocator(const NCVMemStackAllocator &);
public:
explicit NCVMemStackAllocator(Ncv32u alignment);
NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment);
virtual ~NCVMemStackAllocator();
virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
virtual NCVStatus dealloc(NCVMemSegment &seg);
virtual NcvBool isInitialized(void) const;
virtual NcvBool isCounting(void) const;
virtual NCVMemoryType memType(void) const;
virtual Ncv32u alignment(void) const;
virtual size_t maxSize(void) const;
private:
NCVMemoryType _memType;
Ncv32u _alignment;
Ncv8u *allocBegin;
Ncv8u *begin;
Ncv8u *end;
size_t currentSize;
size_t _maxSize;
};
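//Illustrative sketch (editor's note, not part of the original header): the
//alignment-only constructor yields a "counting" allocator (memType() == None)
//that records peak demand without allocating, enabling a two-pass pattern:
//
//    NCVMemStackAllocator counter(alignment);                  //pass 1: measure
//    //...run the pipeline against 'counter'...
//    NCVMemStackAllocator real(NCVMemoryTypeDevice,
//                              counter.maxSize(), alignment);  //pass 2: execute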
/**
* NCVMemNativeAllocator
*/
class NCVMemNativeAllocator : public INCVMemAllocator
{
public:
NCVMemNativeAllocator(NCVMemoryType memT);
virtual ~NCVMemNativeAllocator();
virtual NCVStatus alloc(NCVMemSegment &seg, size_t size);
virtual NCVStatus dealloc(NCVMemSegment &seg);
virtual NcvBool isInitialized(void) const;
virtual NcvBool isCounting(void) const;
virtual NCVMemoryType memType(void) const;
virtual Ncv32u alignment(void) const;
virtual size_t maxSize(void) const;
private:
NCVMemNativeAllocator();
NCVMemNativeAllocator(const NCVMemNativeAllocator &);
NCVMemoryType _memType;
Ncv32u _alignment;
size_t currentSize;
size_t _maxSize;
};
/**
* Copy dispatcher
*/
NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
const void *src, NCVMemoryType srcType,
size_t sz, cudaStream_t cuStream);
/**
* NCVVector (1D)
*/
template <class T>
class NCVVector
{
NCVVector(const NCVVector &);
public:
NCVVector()
{
clear();
}
virtual ~NCVVector() {}
void clear()
{
_ptr = NULL;
_length = 0;
_memtype = NCVMemoryTypeNone;
}
NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
{
if (howMuch == 0)
{
ncvAssertReturn(dst._length == this->_length, NCV_MEM_COPY_ERROR);
howMuch = this->_length * sizeof(T);
}
else
{
ncvAssertReturn(dst._length * sizeof(T) >= howMuch &&
this->_length * sizeof(T) >= howMuch &&
howMuch > 0, NCV_MEM_COPY_ERROR);
}
ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) &&
(dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);
NCVStatus ncvStat = NCV_SUCCESS;
if (this->_memtype != NCVMemoryTypeNone)
{
ncvStat = memSegCopyHelper(dst._ptr, dst._memtype,
this->_ptr, this->_memtype,
howMuch, cuStream);
}
return ncvStat;
}
T *ptr() const {return this->_ptr;}
size_t length() const {return this->_length;}
NCVMemoryType memType() const {return this->_memtype;}
protected:
T *_ptr;
size_t _length;
NCVMemoryType _memtype;
};
/**
* NCVVectorAlloc
*/
template <class T>
class NCVVectorAlloc : public NCVVector<T>
{
NCVVectorAlloc();
NCVVectorAlloc(const NCVVectorAlloc &);
public:
NCVVectorAlloc(INCVMemAllocator &allocator, Ncv32u length)
:
allocator(allocator)
{
NCVStatus ncvStat;
this->clear();
this->allocatedMem.clear();
ncvStat = allocator.alloc(this->allocatedMem, length * sizeof(T));
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVVectorAlloc ctor:: alloc failed", );
this->_ptr = (T *)this->allocatedMem.begin.ptr;
this->_length = length;
this->_memtype = this->allocatedMem.begin.memtype;
}
~NCVVectorAlloc()
{
NCVStatus ncvStat;
ncvStat = allocator.dealloc(this->allocatedMem);
ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVVectorAlloc dtor:: dealloc failed");
this->clear();
}
NcvBool isMemAllocated() const
{
return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
}
Ncv32u getAllocatorsAlignment() const
{
return allocator.alignment();
}
NCVMemSegment getSegment() const
{
return allocatedMem;
}
private:
INCVMemAllocator &allocator;
NCVMemSegment allocatedMem;
};
/**
* NCVVectorReuse
*/
template <class T>
class NCVVectorReuse : public NCVVector<T>
{
NCVVectorReuse();
NCVVectorReuse(const NCVVectorReuse &);
public:
explicit NCVVectorReuse(const NCVMemSegment &memSegment)
{
this->bReused = false;
this->clear();
this->_length = memSegment.size / sizeof(T);
this->_ptr = (T *)memSegment.begin.ptr;
this->_memtype = memSegment.begin.memtype;
this->bReused = true;
}
NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
{
this->bReused = false;
this->clear();
ncvAssertPrintReturn(length * sizeof(T) <= memSegment.size, \
"NCVVectorReuse ctor:: memory binding failed due to size mismatch", );
this->_length = length;
this->_ptr = (T *)memSegment.begin.ptr;
this->_memtype = memSegment.begin.memtype;
this->bReused = true;
}
NcvBool isMemReused() const
{
return this->bReused;
}
private:
NcvBool bReused;
};
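//Illustrative sketch (editor's note; the allocator names are assumptions):
//mirror a device vector into a host vector of equal length with copySolid.
//A null stream selects the synchronous cudaMemcpy path in memSegCopyHelper:
//
//    NCVVectorAlloc<Ncv32u> d_vec(gpuAllocator, 1024);
//    NCVVectorAlloc<Ncv32u> h_vec(cpuAllocator, 1024);
//    ncvAssertReturnNcvStat(d_vec.copySolid(h_vec, 0));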
/**
* NCVMatrix (2D)
*/
template <class T>
class NCVMatrix
{
NCVMatrix(const NCVMatrix &);
public:
NCVMatrix()
{
clear();
}
virtual ~NCVMatrix() {}
void clear()
{
_ptr = NULL;
_pitch = 0;
_width = 0;
_height = 0;
_memtype = NCVMemoryTypeNone;
}
Ncv32u stride() const
{
return _pitch / sizeof(T);
}
NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
{
if (howMuch == 0)
{
ncvAssertReturn(dst._pitch == this->_pitch &&
dst._height == this->_height, NCV_MEM_COPY_ERROR);
howMuch = this->_pitch * this->_height;
}
else
{
ncvAssertReturn(dst._pitch * dst._height >= howMuch &&
this->_pitch * this->_height >= howMuch &&
howMuch > 0, NCV_MEM_COPY_ERROR);
}
ncvAssertReturn((this->_ptr != NULL || this->_memtype == NCVMemoryTypeNone) &&
(dst._ptr != NULL || dst._memtype == NCVMemoryTypeNone), NCV_NULL_PTR);
NCVStatus ncvStat = NCV_SUCCESS;
if (this->_memtype != NCVMemoryTypeNone)
{
ncvStat = memSegCopyHelper(dst._ptr, dst._memtype,
this->_ptr, this->_memtype,
howMuch, cuStream);
}
return ncvStat;
}
T *ptr() const {return this->_ptr;}
Ncv32u width() const {return this->_width;}
Ncv32u height() const {return this->_height;}
Ncv32u pitch() const {return this->_pitch;}
NCVMemoryType memType() const {return this->_memtype;}
protected:
T *_ptr;
Ncv32u _width;
Ncv32u _height;
Ncv32u _pitch;
NCVMemoryType _memtype;
};
/**
* NCVMatrixAlloc
*/
template <class T>
class NCVMatrixAlloc : public NCVMatrix<T>
{
NCVMatrixAlloc();
NCVMatrixAlloc(const NCVMatrixAlloc &);
public:
NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0)
:
allocator(allocator)
{
NCVStatus ncvStat;
this->clear();
this->allocatedMem.clear();
Ncv32u widthBytes = width * sizeof(T);
Ncv32u pitchBytes = alignUp(widthBytes, allocator.alignment());
if (pitch != 0)
{
ncvAssertPrintReturn(pitch >= pitchBytes &&
(pitch & (allocator.alignment() - 1)) == 0,
"NCVMatrixAlloc ctor:: incorrect pitch passed", );
pitchBytes = pitch;
}
Ncv32u requiredAllocSize = pitchBytes * height;
ncvStat = allocator.alloc(this->allocatedMem, requiredAllocSize);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc ctor:: alloc failed", );
this->_ptr = (T *)this->allocatedMem.begin.ptr;
this->_width = width;
this->_height = height;
this->_pitch = pitchBytes;
this->_memtype = this->allocatedMem.begin.memtype;
}
~NCVMatrixAlloc()
{
NCVStatus ncvStat;
ncvStat = allocator.dealloc(this->allocatedMem);
ncvAssertPrintCheck(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc dtor:: dealloc failed");
this->clear();
}
NcvBool isMemAllocated() const
{
return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
}
Ncv32u getAllocatorsAlignment() const
{
return allocator.alignment();
}
NCVMemSegment getSegment() const
{
return allocatedMem;
}
private:
INCVMemAllocator &allocator;
NCVMemSegment allocatedMem;
};
/**
* NCVMatrixReuse
*/
template <class T>
class NCVMatrixReuse : public NCVMatrix<T>
{
NCVMatrixReuse();
NCVMatrixReuse(const NCVMatrixReuse &);
public:
NCVMatrixReuse(const NCVMemSegment &memSegment, Ncv32u alignment, Ncv32u width, Ncv32u height, Ncv32u pitch=0, NcvBool bSkipPitchCheck=false)
{
this->bReused = false;
this->clear();
Ncv32u widthBytes = width * sizeof(T);
Ncv32u pitchBytes = alignUp(widthBytes, alignment);
if (pitch != 0)
{
if (!bSkipPitchCheck)
{
ncvAssertPrintReturn(pitch >= pitchBytes &&
(pitch & (alignment - 1)) == 0,
"NCVMatrixReuse ctor:: incorrect pitch passed", );
}
else
{
ncvAssertPrintReturn(pitch >= widthBytes, "NCVMatrixReuse ctor:: incorrect pitch passed", );
}
pitchBytes = pitch;
}
ncvAssertPrintReturn(pitchBytes * height <= memSegment.size, \
"NCVMatrixReuse ctor:: memory binding failed due to size mismatch", );
this->_width = width;
this->_height = height;
this->_pitch = pitchBytes;
this->_ptr = (T *)memSegment.begin.ptr;
this->_memtype = memSegment.begin.memtype;
this->bReused = true;
}
NcvBool isMemReused() const
{
return this->bReused;
}
private:
NcvBool bReused;
};
#endif // _ncv_hpp_

File diff suppressed because it is too large

View File

@ -0,0 +1,501 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in the upcoming GPU Computing Gems
// chapter in detail:
//
// Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
// PDF URL placeholder
// email: aobukhov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////
#ifndef _ncvhaarobjectdetection_hpp_
#define _ncvhaarobjectdetection_hpp_
#include <string>
#include "NCV.hpp"
//==============================================================================
//
// Guaranteed size cross-platform classifier structures
//
//==============================================================================
struct HaarFeature64
{
uint2 _ui2;
#define HaarFeature64_CreateCheck_MaxRectField 0xFF
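//_ui2.x packs the rectangle as four 8-bit fields (an NcvRect8u, hence the 0xFF
//limit above); _ui2.y stores the feature weight as raw Ncv32f bits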
__host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight)
{
ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES);
((NcvRect8u*)&(this->_ui2.x))->x = rectX;
((NcvRect8u*)&(this->_ui2.x))->y = rectY;
((NcvRect8u*)&(this->_ui2.x))->width = rectWidth;
((NcvRect8u*)&(this->_ui2.x))->height = rectHeight;
return NCV_SUCCESS;
}
__host__ NCVStatus setWeight(Ncv32f weight)
{
((Ncv32f*)&(this->_ui2.y))[0] = weight;
return NCV_SUCCESS;
}
__device__ __host__ void getRect(Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
NcvRect8u tmpRect = *(NcvRect8u*)(&this->_ui2.x);
*rectX = tmpRect.x;
*rectY = tmpRect.y;
*rectWidth = tmpRect.width;
*rectHeight = tmpRect.height;
}
__device__ __host__ Ncv32f getWeight(void)
{
return *(Ncv32f*)(&this->_ui2.y);
}
};
struct HaarFeatureDescriptor32
{
private:
#define HaarFeatureDescriptor32_Interpret_MaskFlagTilted 0x80000000
#define HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures 0x7F
#define HaarFeatureDescriptor32_NumFeatures_Shift 24
#define HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset 0x00FFFFFF
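//Bit layout of 'desc' implied by the masks above: bit 31 = tilted flag,
//bits 30..24 = feature count (hence the 0x7F limit), bits 23..0 = offset of
//the first feature (hence the 0x00FFFFFF limit)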
Ncv32u desc;
public:
__host__ NCVStatus create(NcvBool bTilted, Ncv32u numFeatures, Ncv32u offsetFeatures)
{
if (numFeatures > HaarFeatureDescriptor32_CreateCheck_MaxNumFeatures)
{
return NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER;
}
if (offsetFeatures > HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset)
{
return NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE;
}
this->desc = 0;
this->desc |= (bTilted ? HaarFeatureDescriptor32_Interpret_MaskFlagTilted : 0);
this->desc |= (numFeatures << HaarFeatureDescriptor32_NumFeatures_Shift);
this->desc |= offsetFeatures;
return NCV_SUCCESS;
}
__device__ __host__ NcvBool isTilted(void)
{
return (this->desc & HaarFeatureDescriptor32_Interpret_MaskFlagTilted) != 0;
}
__device__ __host__ Ncv32u getNumFeatures(void)
{
return (this->desc & ~HaarFeatureDescriptor32_Interpret_MaskFlagTilted) >> HaarFeatureDescriptor32_NumFeatures_Shift;
}
__device__ __host__ Ncv32u getFeaturesOffset(void)
{
return this->desc & HaarFeatureDescriptor32_CreateCheck_MaxFeatureOffset;
}
};
struct HaarClassifierNodeDescriptor32
{
uint1 _ui1;
#define HaarClassifierNodeDescriptor32_Interpret_MaskSwitch (1 << 30)
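//Bit 30 of _ui1.x discriminates the two payloads: set for an offset to the
//next node, clear for a leaf value, so valid leaf floats must not have bit 30
//set (which create(Ncv32f) verifies below)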
__host__ NCVStatus create(Ncv32f leafValue)
{
if ((*(Ncv32u *)&leafValue) & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
*(Ncv32f *)&this->_ui1 = leafValue;
return NCV_SUCCESS;
}
__host__ NCVStatus create(Ncv32u offsetHaarClassifierNode)
{
if (offsetHaarClassifierNode >= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch)
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
this->_ui1.x = offsetHaarClassifierNode;
this->_ui1.x |= HaarClassifierNodeDescriptor32_Interpret_MaskSwitch;
return NCV_SUCCESS;
}
__device__ __host__ NcvBool isLeaf(void)
{
return !(this->_ui1.x & HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);
}
__host__ Ncv32f getLeafValueHost(void)
{
return *(Ncv32f *)&this->_ui1.x;
}
#ifdef __CUDACC__
__device__ Ncv32f getLeafValue(void)
{
return __int_as_float(this->_ui1.x);
}
#endif
__device__ __host__ Ncv32u getNextNodeOffset(void)
{
return (this->_ui1.x & ~HaarClassifierNodeDescriptor32_Interpret_MaskSwitch);
}
};
struct HaarClassifierNode128
{
uint4 _ui4;
__host__ NCVStatus setFeatureDesc(HaarFeatureDescriptor32 f)
{
this->_ui4.x = *(Ncv32u *)&f;
return NCV_SUCCESS;
}
__host__ NCVStatus setThreshold(Ncv32f t)
{
this->_ui4.y = *(Ncv32u *)&t;
return NCV_SUCCESS;
}
__host__ NCVStatus setLeftNodeDesc(HaarClassifierNodeDescriptor32 nl)
{
this->_ui4.z = *(Ncv32u *)&nl;
return NCV_SUCCESS;
}
__host__ NCVStatus setRightNodeDesc(HaarClassifierNodeDescriptor32 nr)
{
this->_ui4.w = *(Ncv32u *)&nr;
return NCV_SUCCESS;
}
__host__ __device__ HaarFeatureDescriptor32 getFeatureDesc(void)
{
return *(HaarFeatureDescriptor32 *)&this->_ui4.x;
}
__host__ __device__ Ncv32f getThreshold(void)
{
return *(Ncv32f*)&this->_ui4.y;
}
__host__ __device__ HaarClassifierNodeDescriptor32 getLeftNodeDesc(void)
{
return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.z;
}
__host__ __device__ HaarClassifierNodeDescriptor32 getRightNodeDesc(void)
{
return *(HaarClassifierNodeDescriptor32 *)&this->_ui4.w;
}
};
struct HaarStage64
{
#define HaarStage64_Interpret_MaskRootNodes 0x0000FFFF
#define HaarStage64_Interpret_MaskRootNodeOffset 0xFFFF0000
#define HaarStage64_Interpret_ShiftRootNodeOffset 16
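//Packing implied by the masks above: _ui2.x holds the stage threshold as raw
//float bits; _ui2.y keeps the root-node count in bits 15..0 and the offset of
//the first root node in bits 31..16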
uint2 _ui2;
__host__ NCVStatus setStageThreshold(Ncv32f t)
{
this->_ui2.x = *(Ncv32u *)&t;
return NCV_SUCCESS;
}
__host__ NCVStatus setStartClassifierRootNodeOffset(Ncv32u val)
{
if (val > (HaarStage64_Interpret_MaskRootNodeOffset >> HaarStage64_Interpret_ShiftRootNodeOffset))
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
this->_ui2.y = (val << HaarStage64_Interpret_ShiftRootNodeOffset) | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);
return NCV_SUCCESS;
}
__host__ NCVStatus setNumClassifierRootNodes(Ncv32u val)
{
if (val > HaarStage64_Interpret_MaskRootNodes)
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
this->_ui2.y = val | (this->_ui2.y & HaarStage64_Interpret_MaskRootNodeOffset);
return NCV_SUCCESS;
}
__host__ __device__ Ncv32f getStageThreshold(void)
{
return *(Ncv32f*)&this->_ui2.x;
}
__host__ __device__ Ncv32u getStartClassifierRootNodeOffset(void)
{
return (this->_ui2.y >> HaarStage64_Interpret_ShiftRootNodeOffset);
}
__host__ __device__ Ncv32u getNumClassifierRootNodes(void)
{
return (this->_ui2.y & HaarStage64_Interpret_MaskRootNodes);
}
};
NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8);
NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4);
NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4);
NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16);
NPPST_CT_ASSERT(sizeof(HaarStage64) == 8);
//==============================================================================
//
// Classifier cascade descriptor
//
//==============================================================================
struct HaarClassifierCascadeDescriptor
{
Ncv32u NumStages;
Ncv32u NumClassifierRootNodes;
Ncv32u NumClassifierTotalNodes;
Ncv32u NumFeatures;
NcvSize32u ClassifierSize;
NcvBool bNeedsTiltedII;
NcvBool bHasStumpsOnly;
};
//==============================================================================
//
// Functional interface
//
//==============================================================================
enum
{
NCVPipeObjDet_Default = 0x000,
NCVPipeObjDet_UseFairImageScaling = 0x001,
NCVPipeObjDet_FindLargestObject = 0x002,
NCVPipeObjDet_VisualizeInPlace = 0x004,
};
NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NcvSize32u srcRoi,
NCVVector<NcvRect32u> &d_dstRects,
Ncv32u &dstNumRects,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvSize32u minObjSize,
Ncv32u minNeighbors, //default 4
Ncv32f scaleStep, //default 1.2f
Ncv32u pixelStep, //default 1
Ncv32u flags, //default NCVPipeObjDet_Default
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
Ncv32u devPropMajor,
Ncv32u devPropMinor,
cudaStream_t cuStream);
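//Illustrative call sketch (editor's note; variable names are assumptions):
//load the cascade with ncvHaarLoadFromFile_host (declared below), upload the
//stages, nodes and features into device vectors, then run the full pipeline:
//
//    Ncv32u numDetections = 0;
//    ncvAssertReturnNcvStat(ncvDetectObjectsMultiScale_device(
//        d_srcImg, srcRoi, d_dstRects, numDetections, haar,
//        h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures,
//        minObjSize, 4 /*minNeighbors*/, 1.2f /*scaleStep*/, 1 /*pixelStep*/,
//        NCVPipeObjDet_Default, gpuAllocator, cpuAllocator,
//        devPropMajor, devPropMinor, cuStream));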
#define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF
#define HAAR_STDDEV_BORDER 1
NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
NCVMatrix<Ncv32f> &d_weights,
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
Ncv32u devPropMajor,
Ncv32u devPropMinor,
cudaStream_t cuStream);
NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
NCVMatrix<Ncv32f> &h_weights,
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea);
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv8u color,
cudaStream_t cuStream);
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv32u color,
cudaStream_t cuStream);
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv8u color);
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv32u color);
#define RECT_SIMILARITY_PROPORTION 0.2f
NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale,
cudaStream_t cuStream);
NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale);
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights);
NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
Ncv32u &numNodes, Ncv32u &numFeatures);
NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures);
NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
HaarClassifierCascadeDescriptor haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures);
#endif // _ncvhaarobjectdetection_hpp_

View File

@ -0,0 +1,174 @@
////////////////////////////////////////////////////////////////////////////////
// The Loki Library
// Copyright (c) 2001 by Andrei Alexandrescu
// This code accompanies the book:
// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design
// Patterns Applied". Copyright (c) 2001. Addison-Wesley.
// Permission to use, copy, modify, distribute and sell this software for any
// purpose is hereby granted without fee, provided that the above copyright
// notice appear in all copies and that both that copyright notice and this
// permission notice appear in supporting documentation.
// The author or Addison-Wesley Longman make no representations about the
// suitability of this software for any purpose. It is provided "as is"
// without express or implied warranty.
// http://loki-lib.sourceforge.net/index.php?n=Main.License
////////////////////////////////////////////////////////////////////////////////
#ifndef _ncvruntimetemplates_hpp_
#define _ncvruntimetemplates_hpp_
#include <stdarg.h>
#include <vector>
namespace Loki
{
//==============================================================================
// class NullType
// Used as a placeholder for "no type here"
// Useful as an end marker in typelists
//==============================================================================
class NullType {};
//==============================================================================
// class template Typelist
// The building block of typelists of any length
// Use it through the LOKI_TYPELIST_NN macros
// Defines nested types:
// Head (first element, a non-typelist type by convention)
// Tail (second element, can be another typelist)
//==============================================================================
template <class T, class U>
struct Typelist
{
typedef T Head;
typedef U Tail;
};
//==============================================================================
// class template Int2Type
// Converts each integral constant into a unique type
// Invocation: Int2Type<v> where v is a compile-time constant integral
// Defines 'value', an enum that evaluates to v
//==============================================================================
template <int v>
struct Int2Type
{
enum { value = v };
};
namespace TL
{
//==============================================================================
// class template TypeAt
// Finds the type at a given index in a typelist
// Invocation (TList is a typelist and index is a compile-time integral
// constant):
// TypeAt<TList, index>::Result
// returns the type in position 'index' in TList
// If you pass an out-of-bounds index, the result is a compile-time error
//==============================================================================
template <class TList, unsigned int index> struct TypeAt;
template <class Head, class Tail>
struct TypeAt<Typelist<Head, Tail>, 0>
{
typedef Head Result;
};
template <class Head, class Tail, unsigned int i>
struct TypeAt<Typelist<Head, Tail>, i>
{
typedef typename TypeAt<Tail, i - 1>::Result Result;
};
}
}
////////////////////////////////////////////////////////////////////////////////
// Runtime boolean template instance dispatcher
// Cyril Crassin <cyril.crassin@icare3d.org>
// NVIDIA, 2010
////////////////////////////////////////////////////////////////////////////////
namespace NCVRuntimeTemplateBool
{
//This struct is used to transform a list of parameters into template arguments
//The idea is to build a typelist containing the arguments
//and to pass this typelist to a user defined functor
template<typename TList, int NumArguments, class Func>
struct KernelCaller
{
//Convenience function used by the user
//Takes a variable argument list, transforms it into a list
static void call(Func &functor, int dummy, ...)
{
//Vector used to collect arguments
std::vector<int> templateParamList;
//Variable argument list manipulation
va_list listPointer;
va_start(listPointer, dummy);
//Collect parameters into the list
for(int i=0; i<NumArguments; i++)
{
int val = va_arg(listPointer, int);
templateParamList.push_back(val);
}
va_end(listPointer);
//Call the actual typelist building function
call(functor, templateParamList);
}
//Actual function called recursively to build a typelist based
//on a list of values
static void call( Func &functor, std::vector<int> &templateParamList)
{
//Get current parameter value in the list
int val = templateParamList[templateParamList.size() - 1];
templateParamList.pop_back();
//Select the compile time value to add into the typelist
//depending on the runtime variable and make recursive call.
//Both versions are really instantiated
if(val)
{
KernelCaller<
Loki::Typelist<Loki::Int2Type<true>, TList >,
NumArguments-1, Func >
::call(functor, templateParamList);
}
else
{
KernelCaller<
Loki::Typelist<Loki::Int2Type<false>, TList >,
NumArguments-1, Func >
::call(functor, templateParamList);
}
}
};
//Specialization for 0 value left in the list
//-> actual kernel functor call
template<class TList, class Func>
struct KernelCaller<TList, 0, Func>
{
static void call(Func &functor)
{
//Call to the functor's kernel call method
functor.call(TList()); //TList instantiated to get the method template parameter resolved
}
static void call(Func &functor, std::vector<int> &templateParams)
{
functor.call(TList());
}
};
}
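//Illustrative sketch (editor's note; ExampleFunctor is an assumption): the
//functor's call(TList) receives the runtime booleans as compile-time Int2Type
//entries, recoverable with Loki::TL::TypeAt:
//
//    struct ExampleFunctor
//    {
//        template <class TList>
//        void call(TList)
//        {
//            //TypeAt<TList, 0> is Int2Type<flagA>: values are consumed from
//            //the back of the list and prepended, so argument order is kept
//            typedef typename Loki::TL::TypeAt<TList, 0>::Result Flag0;
//            //...dispatch at compile time on Flag0::value...
//        }
//    };
//
//    ExampleFunctor f;
//    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, ExampleFunctor>
//        ::call(f, 0 /*dummy*/, (int)flagA, (int)flagB);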
#endif //_ncvruntimetemplates_hpp_

View File

@ -71,6 +71,9 @@
#include "npp_staging.h"
#include "surf_key_point.h"
#include "nvidia/NCV.hpp"
#include "nvidia/NCVHaarObjectDetection.hpp"
#define CUDART_MINIMUM_REQUIRED_VERSION 3020
#define NPP_MINIMUM_REQUIRED_VERSION 3216

View File

@ -0,0 +1,193 @@
// WARNING: this sample is under construction! Use it at your own risk.
#include <opencv2/contrib/contrib.hpp>
#include <opencv2/objdetect/objdetect.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>
#include <iomanip>
#include <stdio.h>
using namespace std;
using namespace cv;
using namespace cv::gpu;
void help()
{
cout << "Usage: ./cascadeclassifier <cascade_file> <image_or_video_or_cameraid>\n"
"Using OpenCV version " << CV_VERSION << endl << endl;
}
void DetectAndDraw(Mat& img, CascadeClassifier_GPU& cascade);
String cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml";
String nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml";
template<class T> void convertAndResize(const T& src, T& gray, T& resized, double scale = 2.0)
{
if (src.channels() == 3)
cvtColor( src, gray, CV_BGR2GRAY );
else
gray = src;
Size sz(cvRound(gray.cols * scale), cvRound(gray.rows * scale));
if (scale != 1)
resize(gray, resized, sz);
else
resized = gray;
}
int main( int argc, const char** argv )
{
if (argc != 3)
return help(), -1;
if (cv::gpu::getCudaEnabledDeviceCount() == 0)
return cerr << "No GPU found or the library is compiled without GPU support" << endl, -1;
VideoCapture capture;
string cascadeName = argv[1];
string inputName = argv[2];
cv::gpu::CascadeClassifier_GPU cascade_gpu;
if( !cascade_gpu.load( cascadeName ) )
return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
cv::CascadeClassifier cascade_cpu;
if( !cascade_cpu.load( cascadeName ) )
return cerr << "ERROR: Could not load cascade classifier \"" << cascadeName << "\"" << endl, help(), -1;
Mat image = imread( inputName);
if( image.empty() )
if (!capture.open(inputName))
{
int camid = 0;
sscanf(inputName.c_str(), "%d", &camid);
if(!capture.open(camid))
cout << "Can't open source" << endl;
}
namedWindow( "result", 1 );
Size fontSz = cv::getTextSize("T[]", FONT_HERSHEY_SIMPLEX, 1.0, 2, 0);
Mat frame, frame_cpu, gray_cpu, resized_cpu, faces_downloaded, frameDisp;
vector<Rect> facesBuf_cpu;
GpuMat frame_gpu, gray_gpu, resized_gpu, facesBuf_gpu;
/* parameters */
bool useGPU = true;
double scale_factor = 2;
bool visualizeInPlace = false;
bool findLargestObject = false;
printf("\t<space> - toggle GPU/CPU\n");
printf("\tL - toggle lagest faces\n");
printf("\tV - toggle visualisation in-place (for GPU only)\n");
printf("\t1/q - inc/dec scale\n");
int detections_num;
for(;;)
{
if( capture.isOpened() )
{
capture >> frame;
if( frame.empty())
break;
}
(image.empty() ? frame : image).copyTo(frame_cpu);
frame_gpu.upload( image.empty() ? frame : image);
convertAndResize(frame_gpu, gray_gpu, resized_gpu, scale_factor);
convertAndResize(frame_cpu, gray_cpu, resized_cpu, scale_factor);
cv::TickMeter tm;
tm.start();
if (useGPU)
{
cascade_gpu.visualizeInPlace = visualizeInPlace;
cascade_gpu.findLargestObject = findLargestObject;
detections_num = cascade_gpu.detectMultiScale( resized_gpu, facesBuf_gpu );
facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
}
else /* so use CPU */
{
Size minSize = cascade_gpu.getClassifierSize();
if (findLargestObject)
{
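//grow the cascade's native window toward the frame size so the CPU path
//searches only near-largest objects (mirrors the GPU findLargestObject mode)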
float ratio = (float)std::min(frame.cols / minSize.width, frame.rows / minSize.height);
ratio = std::max(ratio / 2.5f, 1.f);
minSize = Size(cvRound(minSize.width * ratio), cvRound(minSize.height * ratio));
}
cascade_cpu.detectMultiScale(resized_cpu, facesBuf_cpu, 1.2, 4, (findLargestObject ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE, minSize);
detections_num = (int)facesBuf_cpu.size();
}
tm.stop();
printf( "detection time = %g ms\n", tm.getTimeMilli() );
if (useGPU)
resized_gpu.download(resized_cpu);
if (!visualizeInPlace || !useGPU)
if (detections_num)
{
Rect* faces = useGPU ? faces_downloaded.ptr<Rect>() : &facesBuf_cpu[0];
for(int i = 0; i < detections_num; ++i)
cv::rectangle(resized_cpu, faces[i], Scalar(255));
}
Point text_pos(5, 25);
int offs = fontSz.height + 5;
Scalar color = CV_RGB(255, 0, 0);
cv::cvtColor(resized_cpu, frameDisp, CV_GRAY2BGR);
char buf[4096];
sprintf(buf, "%s, FPS = %0.3g", useGPU ? "GPU" : "CPU", 1.0/tm.getTimeSec());
putText(frameDisp, buf, text_pos, FONT_HERSHEY_SIMPLEX, 1.0, color, 2);
sprintf(buf, "scale = %0.3g, [%d*scale x %d*scale]", scale_factor, frame.cols, frame.rows);
putText(frameDisp, buf, text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);
putText(frameDisp, "Hotkeys: space, 1, Q, L, V, Esc", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);
if (findLargestObject)
putText(frameDisp, "FindLargestObject", text_pos+=Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);
if (visualizeInPlace && useGPU)
putText(frameDisp, "VisualizeInPlace", text_pos+Point(0,offs), FONT_HERSHEY_SIMPLEX, 1.0, color, 2);
cv::imshow( "result", frameDisp);
int key = waitKey( 5 );
if( key == 27)
break;
switch (key)
{
case (int)' ': useGPU = !useGPU; printf("Using %s\n", useGPU ? "GPU" : "CPU");break;
case (int)'v': case (int)'V': visualizeInPlace = !visualizeInPlace; printf("VisualizeInPlace = %d\n", visualizeInPlace); break;
case (int)'l': case (int)'L': findLargestObject = !findLargestObject; printf("FindLargestObject = %d\n", findLargestObject); break;
case (int)'1': scale_factor*=1.05; printf("Scale factor = %g\n", scale_factor); break;
case (int)'q': case (int)'Q':scale_factor/=1.05; printf("Scale factor = %g\n", scale_factor); break;
}
}
return 0;
}