renamed gpunvidia -> gpulegacy

Vladislav Vinogradov
2013-04-18 10:14:43 +04:00
parent 7e91e1871d
commit 508fb6aa5b
62 changed files with 69 additions and 60 deletions


@@ -0,0 +1,888 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
//==============================================================================
//
// Error handling helpers
//
//==============================================================================
namespace
{
#define error_entry(entry) { entry, #entry }
struct ErrorEntry
{
int code;
const char* str;
};
struct ErrorEntryComparer
{
int code;
ErrorEntryComparer(int code_) : code(code_) {}
bool operator()(const ErrorEntry& e) const { return e.code == code; }
};
//////////////////////////////////////////////////////////////////////////
// NCV errors
const ErrorEntry ncv_errors [] =
{
error_entry( NCV_SUCCESS ),
error_entry( NCV_UNKNOWN_ERROR ),
error_entry( NCV_CUDA_ERROR ),
error_entry( NCV_NPP_ERROR ),
error_entry( NCV_FILE_ERROR ),
error_entry( NCV_NULL_PTR ),
error_entry( NCV_INCONSISTENT_INPUT ),
error_entry( NCV_TEXTURE_BIND_ERROR ),
error_entry( NCV_DIMENSIONS_INVALID ),
error_entry( NCV_INVALID_ROI ),
error_entry( NCV_INVALID_STEP ),
error_entry( NCV_INVALID_SCALE ),
error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ),
error_entry( NCV_ALLOCATOR_BAD_ALLOC ),
error_entry( NCV_ALLOCATOR_BAD_DEALLOC ),
error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ),
error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ),
error_entry( NCV_ALLOCATOR_BAD_REUSE ),
error_entry( NCV_MEM_COPY_ERROR ),
error_entry( NCV_MEM_RESIDENCE_ERROR ),
error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ),
error_entry( NCV_HAAR_INVALID_PIXEL_STEP ),
error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ),
error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ),
error_entry( NCV_HAAR_TOO_LARGE_FEATURES ),
error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ),
error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ),
error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ),
error_entry( NPPST_SUCCESS ),
error_entry( NPPST_ERROR ),
error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ),
error_entry( NPPST_NULL_POINTER_ERROR ),
error_entry( NPPST_TEXTURE_BIND_ERROR ),
error_entry( NPPST_MEMCPY_ERROR ),
error_entry( NPPST_MEM_ALLOC_ERR ),
error_entry( NPPST_MEMFREE_ERR ),
error_entry( NPPST_INVALID_ROI ),
error_entry( NPPST_INVALID_STEP ),
error_entry( NPPST_INVALID_SCALE ),
error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ),
error_entry( NPPST_MEM_RESIDENCE_ERROR ),
error_entry( NPPST_MEM_INTERNAL_ERROR )
};
const size_t ncv_error_num = sizeof(ncv_errors) / sizeof(ncv_errors[0]);
}
cv::String cv::gpu::getNcvErrorMessage(int code)
{
size_t idx = std::find_if(ncv_errors, ncv_errors + ncv_error_num, ErrorEntryComparer(code)) - ncv_errors;
const char* msg = (idx != ncv_error_num) ? ncv_errors[idx].str : "Unknown error code";
String str = cv::format("%s [Code = %d]", msg, code);
return str;
}
static void stdDebugOutput(const cv::String &msg)
{
std::cout << msg.c_str() << std::endl;
}
static NCVDebugOutputHandler *debugOutputHandler = stdDebugOutput;
void ncvDebugOutput(const cv::String &msg)
{
debugOutputHandler(msg);
}
void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func)
{
debugOutputHandler = func;
}
//==============================================================================
//
// Memory wrappers and helpers
//
//==============================================================================
Ncv32u alignUp(Ncv32u what, Ncv32u alignment)
{
Ncv32u alignMask = alignment-1;
Ncv32u inverseAlignMask = ~alignMask;
Ncv32u res = (what + alignMask) & inverseAlignMask;
return res;
}
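// Worked example: with alignment == 8, alignMask == 7, so
// alignUp(13, 8) == (13 + 7) & ~7 == 16 and alignUp(16, 8) == 16.
// The mask trick is only valid for power-of-two alignments.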
void NCVMemPtr::clear()
{
ptr = NULL;
memtype = NCVMemoryTypeNone;
}
void NCVMemSegment::clear()
{
begin.clear();
size = 0;
}
NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NCVMemoryType srcType, size_t sz, cudaStream_t cuStream)
{
NCVStatus ncvStat;
switch (dstType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
memcpy(dst, src, sz);
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
case NCVMemoryTypeDevice:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpyAsync(dst, src, sz, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy(dst, src, sz, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
return ncvStat;
}
NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream)
{
NCVStatus ncvStat;
switch (dstType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
for (Ncv32u i=0; i<height; i++)
{
memcpy((char*)dst + i * dstPitch, (char*)src + i * srcPitch, widthbytes);
}
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
case NCVMemoryTypeDevice:
switch (srcType)
{
case NCVMemoryTypeHostPageable:
case NCVMemoryTypeHostPinned:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
case NCVMemoryTypeDevice:
if (cuStream != 0)
{
ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice, cuStream), NCV_CUDA_ERROR);
}
else
{
ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, cudaMemcpyDeviceToDevice), NCV_CUDA_ERROR);
}
ncvStat = NCV_SUCCESS;
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
break;
default:
ncvStat = NCV_MEM_RESIDENCE_ERROR;
}
return ncvStat;
}
//===================================================================
//
// NCVMemStackAllocator class members implementation
//
//===================================================================
NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_) :
_memType(NCVMemoryTypeNone),
_alignment(alignment_),
allocBegin(NULL),
begin(NULL),
end(NULL),
currentSize(0),
_maxSize(0),
bReusesMemory(false)
{
NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
}
NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr) :
_memType(memT),
_alignment(alignment_),
allocBegin(NULL),
currentSize(0),
_maxSize(0)
{
NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
allocBegin = NULL;
if (reusePtr == NULL && capacity != 0)
{
bReusesMemory = false;
switch (memT)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), );
break;
case NCVMemoryTypeHostPageable:
allocBegin = (Ncv8u *)malloc(capacity);
break;
default:;
}
}
else
{
bReusesMemory = true;
allocBegin = (Ncv8u *)reusePtr;
}
if (capacity == 0)
{
allocBegin = (Ncv8u *)(0x1);
}
if (!isCounting())
{
begin = allocBegin;
end = begin + capacity;
}
}
NCVMemStackAllocator::~NCVMemStackAllocator()
{
if (allocBegin != NULL)
{
ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction");
if (!bReusesMemory && (allocBegin != (Ncv8u *)(0x1)))
{
switch (_memType)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaFree(allocBegin), );
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaFreeHost(allocBegin), );
break;
case NCVMemoryTypeHostPageable:
free(allocBegin);
break;
default:;
}
}
allocBegin = NULL;
}
}
NCVStatus NCVMemStackAllocator::alloc(NCVMemSegment &seg, size_t size)
{
seg.clear();
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
size = alignUp(static_cast<Ncv32u>(size), this->_alignment);
this->currentSize += size;
this->_maxSize = std::max(this->_maxSize, this->currentSize);
if (!isCounting())
{
size_t availSize = end - begin;
ncvAssertReturn(size <= availSize, NCV_ALLOCATOR_INSUFFICIENT_CAPACITY);
}
seg.begin.ptr = begin;
seg.begin.memtype = this->_memType;
seg.size = size;
begin += size;
return NCV_SUCCESS;
}
NCVStatus NCVMemStackAllocator::dealloc(NCVMemSegment &seg)
{
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(seg.begin.ptr != NULL || isCounting(), NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(seg.begin.ptr == begin - seg.size, NCV_ALLOCATOR_DEALLOC_ORDER);
currentSize -= seg.size;
begin -= seg.size;
seg.clear();
ncvAssertReturn(allocBegin <= begin, NCV_ALLOCATOR_BAD_DEALLOC);
return NCV_SUCCESS;
}
NcvBool NCVMemStackAllocator::isInitialized(void) const
{
return (((this->_alignment & (this->_alignment-1)) == 0) && isCounting()) || this->allocBegin != NULL;
}
NcvBool NCVMemStackAllocator::isCounting(void) const
{
return this->_memType == NCVMemoryTypeNone;
}
NCVMemoryType NCVMemStackAllocator::memType(void) const
{
return this->_memType;
}
Ncv32u NCVMemStackAllocator::alignment(void) const
{
return this->_alignment;
}
size_t NCVMemStackAllocator::maxSize(void) const
{
return this->_maxSize;
}
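For illustration, a sketch of the two-pass sizing pattern this allocator supports (alignment and vector length are assumed values here, and NCVVectorAlloc is assumed to be the NCV container from NCV.hpp that draws from an allocator):

// Pass 1: a counting allocator (no memory type) only records peak demand.
NCVMemStackAllocator counter(128);                     // power-of-two alignment
{
    NCVVectorAlloc<Ncv32u> tmp(counter, 4096);         // no real allocation happens
    // ... run the measured pipeline against `counter` ...
}
// Pass 2: back one real device arena with the measured capacity.
NCVMemStackAllocator arena(NCVMemoryTypeDevice, counter.maxSize(), 128);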
//===================================================================
//
// NCVMemNativeAllocator class members implementation
//
//===================================================================
NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_) :
_memType(memT),
_alignment(alignment_),
currentSize(0),
_maxSize(0)
{
ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
}
NCVMemNativeAllocator::~NCVMemNativeAllocator()
{
ncvAssertPrintCheck(currentSize == 0, "NCVMemNativeAllocator dtor:: detected memory leak");
}
NCVStatus NCVMemNativeAllocator::alloc(NCVMemSegment &seg, size_t size)
{
seg.clear();
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
switch (this->_memType)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaMalloc(&seg.begin.ptr, size), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaMallocHost(&seg.begin.ptr, size), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPageable:
seg.begin.ptr = (Ncv8u *)malloc(size);
break;
default:;
}
this->currentSize += alignUp(static_cast<Ncv32u>(size), this->_alignment);
this->_maxSize = std::max(this->_maxSize, this->currentSize);
seg.begin.memtype = this->_memType;
seg.size = size;
return NCV_SUCCESS;
}
NCVStatus NCVMemNativeAllocator::dealloc(NCVMemSegment &seg)
{
ncvAssertReturn(isInitialized(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturn(seg.begin.memtype == this->_memType, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(seg.begin.ptr != NULL, NCV_ALLOCATOR_BAD_DEALLOC);
ncvAssertReturn(currentSize >= alignUp(static_cast<Ncv32u>(seg.size), this->_alignment), NCV_ALLOCATOR_BAD_DEALLOC);
currentSize -= alignUp(static_cast<Ncv32u>(seg.size), this->_alignment);
switch (this->_memType)
{
case NCVMemoryTypeDevice:
ncvAssertCUDAReturn(cudaFree(seg.begin.ptr), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPinned:
ncvAssertCUDAReturn(cudaFreeHost(seg.begin.ptr), NCV_CUDA_ERROR);
break;
case NCVMemoryTypeHostPageable:
free(seg.begin.ptr);
break;
default:;
}
seg.clear();
return NCV_SUCCESS;
}
NcvBool NCVMemNativeAllocator::isInitialized(void) const
{
return (this->_alignment != 0);
}
NcvBool NCVMemNativeAllocator::isCounting(void) const
{
return false;
}
NCVMemoryType NCVMemNativeAllocator::memType(void) const
{
return this->_memType;
}
Ncv32u NCVMemNativeAllocator::alignment(void) const
{
return this->_alignment;
}
size_t NCVMemNativeAllocator::maxSize(void) const
{
return this->_maxSize;
}
//===================================================================
//
// Time and timer routines
//
//===================================================================
typedef struct _NcvTimeMoment NcvTimeMoment;
#if defined(_WIN32) || defined(_WIN64)
#include <Windows.h>
typedef struct _NcvTimeMoment
{
LONGLONG moment, freq;
} NcvTimeMoment;
static void _ncvQueryMoment(NcvTimeMoment *t)
{
QueryPerformanceFrequency((LARGE_INTEGER *)&(t->freq));
QueryPerformanceCounter((LARGE_INTEGER *)&(t->moment));
}
double _ncvMomentToMicroseconds(NcvTimeMoment *t)
{
return 1000000.0 * t->moment / t->freq;
}
double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
return 1000000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);
}
double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
return 1000.0 * 2 * ((t2->moment) - (t1->moment)) / (t1->freq + t2->freq);
}
#elif defined(__GNUC__)
#include <sys/time.h>
typedef struct _NcvTimeMoment
{
struct timeval tv;
struct timezone tz;
} NcvTimeMoment;
void _ncvQueryMoment(NcvTimeMoment *t)
{
gettimeofday(& t->tv, & t->tz);
}
double _ncvMomentToMicroseconds(NcvTimeMoment *t)
{
return 1000000.0 * t->tv.tv_sec + (double)t->tv.tv_usec;
}
double _ncvMomentsDiffToMicroseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
return (((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000000 + (double)t2->tv.tv_usec - (double)t1->tv.tv_usec);
}
double _ncvMomentsDiffToMilliseconds(NcvTimeMoment *t1, NcvTimeMoment *t2)
{
return ((double)t2->tv.tv_sec - (double)t1->tv.tv_sec) * 1000 + ((double)t2->tv.tv_usec - (double)t1->tv.tv_usec) / 1000;
}
#endif //#if defined(_WIN32) || defined(_WIN64)
struct _NcvTimer
{
NcvTimeMoment t1, t2;
};
NcvTimer ncvStartTimer(void)
{
struct _NcvTimer *t;
t = (struct _NcvTimer *)malloc(sizeof(struct _NcvTimer));
_ncvQueryMoment(&t->t1);
return t;
}
double ncvEndQueryTimerUs(NcvTimer t)
{
double res;
_ncvQueryMoment(&t->t2);
res = _ncvMomentsDiffToMicroseconds(&t->t1, &t->t2);
free(t);
return res;
}
double ncvEndQueryTimerMs(NcvTimer t)
{
double res;
_ncvQueryMoment(&t->t2);
res = _ncvMomentsDiffToMilliseconds(&t->t1, &t->t2);
free(t);
return res;
}
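Usage is bracket-style; a minimal sketch (the end call also frees the heap-allocated timer):

NcvTimer timer = ncvStartTimer();
// ... measured work ...
double elapsedMs = ncvEndQueryTimerMs(timer);   // releases `timer`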
//===================================================================
//
// Operations with rectangles
//
//===================================================================
struct RectConvert
{
cv::Rect operator()(const NcvRect32u& nr) const { return cv::Rect(nr.x, nr.y, nr.width, nr.height); }
NcvRect32u operator()(const cv::Rect& nr) const
{
NcvRect32u rect;
rect.x = nr.x;
rect.y = nr.y;
rect.width = nr.width;
rect.height = nr.height;
return rect;
}
};
static void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)
{
#ifndef HAVE_OPENCV_OBJDETECT
(void) hypotheses;
(void) groupThreshold;
(void) eps;
(void) weights;
CV_Error(cv::Error::StsNotImplemented, "This functionality requires objdetect module");
#else
std::vector<cv::Rect> rects(hypotheses.size());
std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());
if (weights)
{
std::vector<int> weights_int;
weights_int.assign(weights->begin(), weights->end());
cv::groupRectangles(rects, weights_int, groupThreshold, eps);
weights->assign(weights_int.begin(), weights_int.end()); // propagate the grouped weights back to the caller
}
else
{
cv::groupRectangles(rects, groupThreshold, eps);
}
std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert());
hypotheses.resize(rects.size());
#endif
}
NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights)
{
ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
if (hypothesesWeights != NULL)
{
ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
}
if (numHypotheses == 0)
{
return NCV_SUCCESS;
}
std::vector<NcvRect32u> rects(numHypotheses);
memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
std::vector<Ncv32u> weights;
if (hypothesesWeights != NULL)
{
groupRectangles(rects, minNeighbors, intersectEps, &weights);
}
else
{
groupRectangles(rects, minNeighbors, intersectEps, NULL);
}
numHypotheses = (Ncv32u)rects.size();
if (numHypotheses > 0)
{
memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
}
if (hypothesesWeights != NULL && numHypotheses > 0)
{
memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
}
return NCV_SUCCESS;
}
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
T color)
{
ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects != 0, NCV_SUCCESS);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
for (Ncv32u i=0; i<numRects; i++)
{
NcvRect32u rect = h_rects[i];
if (rect.x < dstWidth)
{
for (Ncv32u each=rect.y; each<rect.y+rect.height && each<dstHeight; each++)
{
h_dst[each*dstStride+rect.x] = color;
}
}
if (rect.x+rect.width-1 < dstWidth)
{
for (Ncv32u each=rect.y; each<rect.y+rect.height && each<dstHeight; each++)
{
h_dst[each*dstStride+rect.x+rect.width-1] = color;
}
}
if (rect.y < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[rect.y*dstStride+j] = color;
}
}
if (rect.y + rect.height - 1 < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
}
}
}
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv8u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv32u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
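A host-side usage sketch of the 8-bit variant (buffer size, rectangle and color are assumed values; NcvRect32u's four-argument constructor is declared in NCV.hpp):

Ncv8u frame[128 * 128] = {0};
NcvRect32u face(32, 40, 24, 24);                // x, y, width, height
ncvDrawRects_8u_host(frame, 128, 128, 128, &face, 1, 255);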


@@ -0,0 +1,180 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <iostream>
#include <vector>
#include "opencv2/gpulegacy/NCV.hpp"
//===================================================================
//
// Operations with rectangles
//
//===================================================================
const Ncv32u NUMTHREADS_DRAWRECTS = 32;
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
template <class T>
__global__ void drawRects(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color)
{
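// Each rectangle is drawn by four consecutive blocks: bit 0 of blockId selects
// vertical vs. horizontal edges, bit 1 selects the left/top vs. right/bottom
// edge. The grid is flattened through blockIdx.y because gridDim.x is capped
// at 65535 on devices of compute capability < 3.0.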
Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
if (blockId >= numRects * 4)
{
return;
}
NcvRect32u curRect = d_rects[blockId >> 2];
NcvBool bVertical = blockId & 0x1;
NcvBool bTopLeft = blockId & 0x2;
Ncv32u pt0x, pt0y;
if (bVertical)
{
Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
pt0y = curRect.y;
if (pt0x < dstWidth)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptY < pt0y + curRect.height && ptY < dstHeight)
{
d_dst[ptY * dstStride + pt0x] = color;
}
}
}
}
else
{
Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = curRect.x;
pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;
if (pt0y < dstHeight)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptX < pt0x + curRect.width && ptX < dstWidth)
{
d_dst[pt0y * dstStride + ptX] = color;
}
}
}
}
}
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color,
cudaStream_t cuStream)
{
(void)cuStream;
ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
if (numRects == 0)
{
return NCV_SUCCESS;
}
dim3 grid(numRects * 4);
dim3 block(NUMTHREADS_DRAWRECTS);
if (grid.x > 65535)
{
grid.y = (grid.x + 65534) / 65535;
grid.x = 65535;
}
drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv8u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv32u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}


@@ -0,0 +1,155 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _ncv_alg_hpp_
#define _ncv_alg_hpp_
#include "opencv2/gpulegacy/NCV.hpp"
template <class T>
static void swap(T &p1, T &p2)
{
T tmp = p1;
p1 = p2;
p2 = tmp;
}
template<typename T>
static T divUp(T a, T b)
{
return (a + b - 1) / b;
}
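// Integer ceiling division, e.g. divUp(10, 3) == 4 (handy for CUDA grid sizing).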
template<typename T>
struct functorAddValues
{
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
{
//Works only for integral types. If you see a compiler error here, then you have to specify how to copy your object as a set of integral fields.
*dst = *src;
}
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
{
in1out += in2;
}
};
template<typename T>
struct functorMinValues
{
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
{
//Works only for integral types. If you see a compiler error here, then you have to specify how to copy your object as a set of integral fields.
*dst = *src;
}
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
{
in1out = in1out > in2 ? in2 : in1out;
}
};
template<typename T>
struct functorMaxValues
{
static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
{
//Works only for integral types. If you see a compiler error here, then you have to specify how to copy your object as a set of integral fields.
*dst = *src;
}
static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
{
in1out = in1out > in2 ? in1out : in2;
}
};
template<typename Tdata, class Tfunc, Ncv32u nThreads>
static __device__ Tdata subReduce(Tdata threadElem)
{
Tfunc functor;
__shared__ Tdata _reduceArr[nThreads];
volatile Tdata *reduceArr = _reduceArr;
functor.assign(reduceArr + threadIdx.x, &threadElem);
__syncthreads();
if (nThreads >= 256 && threadIdx.x < 128)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
}
__syncthreads();
if (nThreads >= 128 && threadIdx.x < 64)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
}
__syncthreads();
if (threadIdx.x < 32)
{
if (nThreads >= 64)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
}
if (nThreads >= 32 && threadIdx.x < 16)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
}
}
__syncthreads();
Tdata reduceRes;
functor.assign(&reduceRes, reduceArr);
return reduceRes;
}
#endif //_ncv_alg_hpp_
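A device-side sketch of how subReduce composes with the functors above (block size and kernel name are assumed for illustration; the header above is assumed to be included):

template <Ncv32u BLOCK>
__global__ void blockSum(const Ncv32f *d_in, Ncv32f *d_out)
{
    Ncv32f v = d_in[blockIdx.x * BLOCK + threadIdx.x];
    Ncv32f sum = subReduce<Ncv32f, functorAddValues<Ncv32f>, BLOCK>(v);
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = sum;                // one partial sum per block
}
// launched e.g. as blockSum<256><<<numBlocks, 256>>>(d_in, d_out);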

File diff suppressed because it is too large


@@ -0,0 +1,100 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
// this file does not contain any code that is currently used.
#ifndef _ncv_color_conversion_hpp_
#define _ncv_color_conversion_hpp_
#include "NCVPixelOperations.hpp"
#if 0
enum NCVColorSpace
{
NCVColorSpaceGray,
NCVColorSpaceRGBA,
};
template<NCVColorSpace CSin, NCVColorSpace CSout, typename Tin, typename Tout> struct __pixColorConv {
static void _pixColorConv(const Tin &pixIn, Tout &pixOut);
};
template<typename Tin, typename Tout> struct __pixColorConv<NCVColorSpaceRGBA, NCVColorSpaceGray, Tin, Tout> {
static void _pixColorConv(const Tin &pixIn, Tout &pixOut)
{
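// ITU-R BT.601 luma weights applied to the R, G and B channels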
Ncv32f luma = 0.299f * pixIn.x + 0.587f * pixIn.y + 0.114f * pixIn.z;
_TDemoteClampNN(luma, pixOut.x);
}};
template<typename Tin, typename Tout> struct __pixColorConv<NCVColorSpaceGray, NCVColorSpaceRGBA, Tin, Tout> {
static void _pixColorConv(const Tin &pixIn, Tout &pixOut)
{
_TDemoteClampNN(pixIn.x, pixOut.x);
_TDemoteClampNN(pixIn.x, pixOut.y);
_TDemoteClampNN(pixIn.x, pixOut.z);
pixOut.w = 0;
}};
template<NCVColorSpace CSin, NCVColorSpace CSout, typename Tin, typename Tout>
static NCVStatus _ncvColorConv_host(const NCVMatrix<Tin> &h_imgIn,
const NCVMatrix<Tout> &h_imgOut)
{
ncvAssertReturn(h_imgIn.size() == h_imgOut.size(), NCV_DIMENSIONS_INVALID);
ncvAssertReturn(h_imgIn.memType() == h_imgOut.memType() &&
(h_imgIn.memType() == NCVMemoryTypeHostPinned || h_imgIn.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
NCV_SET_SKIP_COND(h_imgIn.memType() == NCVMemoryTypeNone);
NCV_SKIP_COND_BEGIN
for (Ncv32u i=0; i<h_imgIn.height(); i++)
{
for (Ncv32u j=0; j<h_imgIn.width(); j++)
{
__pixColorConv<CSin, CSout, Tin, Tout>::_pixColorConv(h_imgIn.at(j,i), h_imgOut.at(j,i));
}
}
NCV_SKIP_COND_END
return NCV_SUCCESS;
}
#endif
#endif //_ncv_color_conversion_hpp_

File diff suppressed because it is too large


@@ -0,0 +1,351 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _ncv_pixel_operations_hpp_
#define _ncv_pixel_operations_hpp_
#include <limits.h>
#include <float.h>
#include "opencv2/gpulegacy/NCV.hpp"
template<typename TBase> inline __host__ __device__ TBase _pixMaxVal();
template<> static inline __host__ __device__ Ncv8u _pixMaxVal<Ncv8u>() {return UCHAR_MAX;}
template<> static inline __host__ __device__ Ncv16u _pixMaxVal<Ncv16u>() {return USHRT_MAX;}
template<> static inline __host__ __device__ Ncv32u _pixMaxVal<Ncv32u>() {return UINT_MAX;}
template<> static inline __host__ __device__ Ncv8s _pixMaxVal<Ncv8s>() {return CHAR_MAX;}
template<> static inline __host__ __device__ Ncv16s _pixMaxVal<Ncv16s>() {return SHRT_MAX;}
template<> static inline __host__ __device__ Ncv32s _pixMaxVal<Ncv32s>() {return INT_MAX;}
template<> static inline __host__ __device__ Ncv32f _pixMaxVal<Ncv32f>() {return FLT_MAX;}
template<> static inline __host__ __device__ Ncv64f _pixMaxVal<Ncv64f>() {return DBL_MAX;}
template<typename TBase> inline __host__ __device__ TBase _pixMinVal();
template<> static inline __host__ __device__ Ncv8u _pixMinVal<Ncv8u>() {return 0;}
template<> static inline __host__ __device__ Ncv16u _pixMinVal<Ncv16u>() {return 0;}
template<> static inline __host__ __device__ Ncv32u _pixMinVal<Ncv32u>() {return 0;}
template<> static inline __host__ __device__ Ncv8s _pixMinVal<Ncv8s>() {return CHAR_MIN;}
template<> static inline __host__ __device__ Ncv16s _pixMinVal<Ncv16s>() {return SHRT_MIN;}
template<> static inline __host__ __device__ Ncv32s _pixMinVal<Ncv32s>() {return INT_MIN;}
template<> static inline __host__ __device__ Ncv32f _pixMinVal<Ncv32f>() {return FLT_MIN;}
template<> static inline __host__ __device__ Ncv64f _pixMinVal<Ncv64f>() {return DBL_MIN;}
template<typename Tvec> struct TConvVec2Base;
template<> struct TConvVec2Base<uchar1> {typedef Ncv8u TBase;};
template<> struct TConvVec2Base<uchar3> {typedef Ncv8u TBase;};
template<> struct TConvVec2Base<uchar4> {typedef Ncv8u TBase;};
template<> struct TConvVec2Base<ushort1> {typedef Ncv16u TBase;};
template<> struct TConvVec2Base<ushort3> {typedef Ncv16u TBase;};
template<> struct TConvVec2Base<ushort4> {typedef Ncv16u TBase;};
template<> struct TConvVec2Base<uint1> {typedef Ncv32u TBase;};
template<> struct TConvVec2Base<uint3> {typedef Ncv32u TBase;};
template<> struct TConvVec2Base<uint4> {typedef Ncv32u TBase;};
template<> struct TConvVec2Base<float1> {typedef Ncv32f TBase;};
template<> struct TConvVec2Base<float3> {typedef Ncv32f TBase;};
template<> struct TConvVec2Base<float4> {typedef Ncv32f TBase;};
template<> struct TConvVec2Base<double1> {typedef Ncv64f TBase;};
template<> struct TConvVec2Base<double3> {typedef Ncv64f TBase;};
template<> struct TConvVec2Base<double4> {typedef Ncv64f TBase;};
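// NC(T) evaluates to the channel count of vector type T, e.g. NC(float3) == 3.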
#define NC(T) (sizeof(T) / sizeof(TConvVec2Base<T>::TBase))
template<typename TBase, Ncv32u NC> struct TConvBase2Vec;
template<> struct TConvBase2Vec<Ncv8u, 1> {typedef uchar1 TVec;};
template<> struct TConvBase2Vec<Ncv8u, 3> {typedef uchar3 TVec;};
template<> struct TConvBase2Vec<Ncv8u, 4> {typedef uchar4 TVec;};
template<> struct TConvBase2Vec<Ncv16u, 1> {typedef ushort1 TVec;};
template<> struct TConvBase2Vec<Ncv16u, 3> {typedef ushort3 TVec;};
template<> struct TConvBase2Vec<Ncv16u, 4> {typedef ushort4 TVec;};
template<> struct TConvBase2Vec<Ncv32u, 1> {typedef uint1 TVec;};
template<> struct TConvBase2Vec<Ncv32u, 3> {typedef uint3 TVec;};
template<> struct TConvBase2Vec<Ncv32u, 4> {typedef uint4 TVec;};
template<> struct TConvBase2Vec<Ncv32f, 1> {typedef float1 TVec;};
template<> struct TConvBase2Vec<Ncv32f, 3> {typedef float3 TVec;};
template<> struct TConvBase2Vec<Ncv32f, 4> {typedef float4 TVec;};
template<> struct TConvBase2Vec<Ncv64f, 1> {typedef double1 TVec;};
template<> struct TConvBase2Vec<Ncv64f, 3> {typedef double3 TVec;};
template<> struct TConvBase2Vec<Ncv64f, 4> {typedef double4 TVec;};
//TODO: consider using CUDA intrinsics to avoid branching
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a);};
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a, 0, USHRT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a, 0, UINT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampZ(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
//TODO: consider using CUDA intrinsics to avoid branching
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv8u &out) {out = (Ncv8u)CLAMP_0_255(a+0.5f);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv16u &out) {out = (Ncv16u)CLAMP(a+0.5f, 0, USHRT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32u &out) {out = (Ncv32u)CLAMP(a+0.5f, 0, UINT_MAX);}
template<typename Tin> static inline __host__ __device__ void _TDemoteClampNN(Tin &a, Ncv32f &out) {out = (Ncv32f)a;}
template<typename Tout> inline Tout _pixMakeZero();
template<> static inline __host__ __device__ uchar1 _pixMakeZero<uchar1>() {return make_uchar1(0);}
template<> static inline __host__ __device__ uchar3 _pixMakeZero<uchar3>() {return make_uchar3(0,0,0);}
template<> static inline __host__ __device__ uchar4 _pixMakeZero<uchar4>() {return make_uchar4(0,0,0,0);}
template<> static inline __host__ __device__ ushort1 _pixMakeZero<ushort1>() {return make_ushort1(0);}
template<> static inline __host__ __device__ ushort3 _pixMakeZero<ushort3>() {return make_ushort3(0,0,0);}
template<> static inline __host__ __device__ ushort4 _pixMakeZero<ushort4>() {return make_ushort4(0,0,0,0);}
template<> static inline __host__ __device__ uint1 _pixMakeZero<uint1>() {return make_uint1(0);}
template<> static inline __host__ __device__ uint3 _pixMakeZero<uint3>() {return make_uint3(0,0,0);}
template<> static inline __host__ __device__ uint4 _pixMakeZero<uint4>() {return make_uint4(0,0,0,0);}
template<> static inline __host__ __device__ float1 _pixMakeZero<float1>() {return make_float1(0.f);}
template<> static inline __host__ __device__ float3 _pixMakeZero<float3>() {return make_float3(0.f,0.f,0.f);}
template<> static inline __host__ __device__ float4 _pixMakeZero<float4>() {return make_float4(0.f,0.f,0.f,0.f);}
template<> static inline __host__ __device__ double1 _pixMakeZero<double1>() {return make_double1(0.);}
template<> static inline __host__ __device__ double3 _pixMakeZero<double3>() {return make_double3(0.,0.,0.);}
template<> static inline __host__ __device__ double4 _pixMakeZero<double4>() {return make_double4(0.,0.,0.,0.);}
static inline __host__ __device__ uchar1 _pixMake(Ncv8u x) {return make_uchar1(x);}
static inline __host__ __device__ uchar3 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z) {return make_uchar3(x,y,z);}
static inline __host__ __device__ uchar4 _pixMake(Ncv8u x, Ncv8u y, Ncv8u z, Ncv8u w) {return make_uchar4(x,y,z,w);}
static inline __host__ __device__ ushort1 _pixMake(Ncv16u x) {return make_ushort1(x);}
static inline __host__ __device__ ushort3 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z) {return make_ushort3(x,y,z);}
static inline __host__ __device__ ushort4 _pixMake(Ncv16u x, Ncv16u y, Ncv16u z, Ncv16u w) {return make_ushort4(x,y,z,w);}
static inline __host__ __device__ uint1 _pixMake(Ncv32u x) {return make_uint1(x);}
static inline __host__ __device__ uint3 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z) {return make_uint3(x,y,z);}
static inline __host__ __device__ uint4 _pixMake(Ncv32u x, Ncv32u y, Ncv32u z, Ncv32u w) {return make_uint4(x,y,z,w);}
static inline __host__ __device__ float1 _pixMake(Ncv32f x) {return make_float1(x);}
static inline __host__ __device__ float3 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z) {return make_float3(x,y,z);}
static inline __host__ __device__ float4 _pixMake(Ncv32f x, Ncv32f y, Ncv32f z, Ncv32f w) {return make_float4(x,y,z,w);}
static inline __host__ __device__ double1 _pixMake(Ncv64f x) {return make_double1(x);}
static inline __host__ __device__ double3 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z) {return make_double3(x,y,z);}
static inline __host__ __device__ double4 _pixMake(Ncv64f x, Ncv64f y, Ncv64f z, Ncv64f w) {return make_double4(x,y,z,w);}
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampZ_CN {static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix);};
template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix)
{
Tout out;
_TDemoteClampZ(pix.x, out.x);
return out;
}};
template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix)
{
Tout out;
_TDemoteClampZ(pix.x, out.x);
_TDemoteClampZ(pix.y, out.y);
_TDemoteClampZ(pix.z, out.z);
return out;
}};
template<typename Tin, typename Tout> struct __pixDemoteClampZ_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _pixDemoteClampZ_CN(Tin &pix)
{
Tout out;
_TDemoteClampZ(pix.x, out.x);
_TDemoteClampZ(pix.y, out.y);
_TDemoteClampZ(pix.z, out.z);
_TDemoteClampZ(pix.w, out.w);
return out;
}};
template<typename Tin, typename Tout> static inline __host__ __device__ Tout _pixDemoteClampZ(Tin &pix)
{
return __pixDemoteClampZ_CN<Tin, Tout, NC(Tin)>::_pixDemoteClampZ_CN(pix);
}
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDemoteClampNN_CN {static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix);};
template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix)
{
Tout out;
_TDemoteClampNN(pix.x, out.x);
return out;
}};
template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix)
{
Tout out;
_TDemoteClampNN(pix.x, out.x);
_TDemoteClampNN(pix.y, out.y);
_TDemoteClampNN(pix.z, out.z);
return out;
}};
template<typename Tin, typename Tout> struct __pixDemoteClampNN_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _pixDemoteClampNN_CN(Tin &pix)
{
Tout out;
_TDemoteClampNN(pix.x, out.x);
_TDemoteClampNN(pix.y, out.y);
_TDemoteClampNN(pix.z, out.z);
_TDemoteClampNN(pix.w, out.w);
return out;
}};
template<typename Tin, typename Tout> static inline __host__ __device__ Tout _pixDemoteClampNN(Tin &pix)
{
return __pixDemoteClampNN_CN<Tin, Tout, NC(Tin)>::_pixDemoteClampNN_CN(pix);
}
template<typename Tin, typename Tout, typename Tw, Ncv32u CN> struct __pixScale_CN {static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w);};
template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 1> {
static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w)
{
Tout out;
typedef typename TConvVec2Base<Tout>::TBase TBout;
out.x = (TBout)(pix.x * w);
return out;
}};
template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 3> {
static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w)
{
Tout out;
typedef typename TConvVec2Base<Tout>::TBase TBout;
out.x = (TBout)(pix.x * w);
out.y = (TBout)(pix.y * w);
out.z = (TBout)(pix.z * w);
return out;
}};
template<typename Tin, typename Tout, typename Tw> struct __pixScale_CN<Tin, Tout, Tw, 4> {
static __host__ __device__ Tout _pixScale_CN(Tin &pix, Tw w)
{
Tout out;
typedef typename TConvVec2Base<Tout>::TBase TBout;
out.x = (TBout)(pix.x * w);
out.y = (TBout)(pix.y * w);
out.z = (TBout)(pix.z * w);
out.w = (TBout)(pix.w * w);
return out;
}};
template<typename Tin, typename Tout, typename Tw> static __host__ __device__ Tout _pixScale(Tin &pix, Tw w)
{
return __pixScale_CN<Tin, Tout, Tw, NC(Tin)>::_pixScale_CN(pix, w);
}
template<typename Tin, typename Tout, Ncv32u CN> struct __pixAdd_CN {static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2);};
template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
{
Tout out;
out.x = pix1.x + pix2.x;
return out;
}};
template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
{
Tout out;
out.x = pix1.x + pix2.x;
out.y = pix1.y + pix2.y;
out.z = pix1.z + pix2.z;
return out;
}};
template<typename Tin, typename Tout> struct __pixAdd_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _pixAdd_CN(Tout &pix1, Tin &pix2)
{
Tout out;
out.x = pix1.x + pix2.x;
out.y = pix1.y + pix2.y;
out.z = pix1.z + pix2.z;
out.w = pix1.w + pix2.w;
return out;
}};
template<typename Tin, typename Tout> static __host__ __device__ Tout _pixAdd(Tout &pix1, Tin &pix2)
{
return __pixAdd_CN<Tin, Tout, NC(Tin)>::_pixAdd_CN(pix1, pix2);
}
template<typename Tin, typename Tout, Ncv32u CN> struct __pixDist_CN {static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2);};
template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2)
{
return Tout(SQR(pix1.x - pix2.x));
}};
template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2)
{
return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z));
}};
template<typename Tin, typename Tout> struct __pixDist_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _pixDist_CN(Tin &pix1, Tin &pix2)
{
return Tout(SQR(pix1.x - pix2.x) + SQR(pix1.y - pix2.y) + SQR(pix1.z - pix2.z) + SQR(pix1.w - pix2.w));
}};
template<typename Tin, typename Tout> static __host__ __device__ Tout _pixDist(Tin &pix1, Tin &pix2)
{
return __pixDist_CN<Tin, Tout, NC(Tin)>::_pixDist_CN(pix1, pix2);
}
template <typename T> struct TAccPixWeighted;
template<> struct TAccPixWeighted<uchar1> {typedef double1 type;};
template<> struct TAccPixWeighted<uchar3> {typedef double3 type;};
template<> struct TAccPixWeighted<uchar4> {typedef double4 type;};
template<> struct TAccPixWeighted<ushort1> {typedef double1 type;};
template<> struct TAccPixWeighted<ushort3> {typedef double3 type;};
template<> struct TAccPixWeighted<ushort4> {typedef double4 type;};
template<> struct TAccPixWeighted<float1> {typedef double1 type;};
template<> struct TAccPixWeighted<float3> {typedef double3 type;};
template<> struct TAccPixWeighted<float4> {typedef double4 type;};
template<typename Tfrom> struct TAccPixDist {};
template<> struct TAccPixDist<uchar1> {typedef Ncv32u type;};
template<> struct TAccPixDist<uchar3> {typedef Ncv32u type;};
template<> struct TAccPixDist<uchar4> {typedef Ncv32u type;};
template<> struct TAccPixDist<ushort1> {typedef Ncv32u type;};
template<> struct TAccPixDist<ushort3> {typedef Ncv32u type;};
template<> struct TAccPixDist<ushort4> {typedef Ncv32u type;};
template<> struct TAccPixDist<float1> {typedef Ncv32f type;};
template<> struct TAccPixDist<float3> {typedef Ncv32f type;};
template<> struct TAccPixDist<float4> {typedef Ncv32f type;};
#endif //_ncv_pixel_operations_hpp_


@@ -0,0 +1,605 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include <stdio.h>
#include <cuda_runtime.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/gpulegacy/NCV.hpp"
#include "opencv2/gpulegacy/NCVPyramid.hpp"
#include "NCVAlg.hpp"
#include "NCVPixelOperations.hpp"
template<typename T, Ncv32u CN> struct __average4_CN {static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};
template<typename T> struct __average4_CN<T, 1> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
T out;
out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
return out;
}};
template<> struct __average4_CN<float1, 1> {
static __host__ __device__ float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11)
{
float1 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
return out;
}};
template<> struct __average4_CN<double1, 1> {
static __host__ __device__ double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11)
{
double1 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
return out;
}};
template<typename T> struct __average4_CN<T, 3> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
T out;
out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
return out;
}};
template<> struct __average4_CN<float3, 3> {
static __host__ __device__ float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11)
{
float3 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
return out;
}};
template<> struct __average4_CN<double3, 3> {
static __host__ __device__ double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11)
{
double3 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
return out;
}};
template<typename T> struct __average4_CN<T, 4> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
T out;
out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
out.w = ((Ncv32s)p00.w + p01.w + p10.w + p11.w + 2) / 4;
return out;
}};
template<> struct __average4_CN<float4, 4> {
static __host__ __device__ float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11)
{
float4 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
return out;
}};
template<> struct __average4_CN<double4, 4> {
static __host__ __device__ double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11)
{
double4 out;
out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
return out;
}};
template<typename T> static __host__ __device__ T _average4(const T &p00, const T &p01, const T &p10, const T &p11)
{
return __average4_CN<T, NC(T)>::_average4_CN(p00, p01, p10, p11);
}
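//A minimal usage sketch (illustrative only, compiled out). NC(T) from
//NCVPixelOperations.hpp yields the channel count of a vector type, so
//_average4 statically dispatches to the 1-, 3- or 4-channel specialization.
#if 0
static void average4Example()
{
    uchar4 a = make_uchar4(10, 0, 0, 255);
    uchar4 b = make_uchar4(20, 0, 0, 255);
    uchar4 c = make_uchar4(30, 0, 0, 255);
    uchar4 d = make_uchar4(40, 0, 0, 255);
    //Resolves to __average4_CN<uchar4, 4>: integer mean with +2 for rounding,
    //so e.x == (10 + 20 + 30 + 40 + 2) / 4 == 25
    uchar4 e = _average4(a, b, c, d);
    (void)e;
}
#endif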
template<typename Tin, typename Tout, Ncv32u CN> struct __lerp_CN {static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);};
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
typedef typename TConvVec2Base<Tout>::TBase TB;
return _pixMake(TB(b.x * d + a.x * (1 - d)));
}};
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
typedef typename TConvVec2Base<Tout>::TBase TB;
return _pixMake(TB(b.x * d + a.x * (1 - d)),
TB(b.y * d + a.y * (1 - d)),
TB(b.z * d + a.z * (1 - d)));
}};
template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
typedef typename TConvVec2Base<Tout>::TBase TB;
return _pixMake(TB(b.x * d + a.x * (1 - d)),
TB(b.y * d + a.y * (1 - d)),
TB(b.z * d + a.z * (1 - d)),
TB(b.w * d + a.w * (1 - d)));
}};
template<typename Tin, typename Tout> static __host__ __device__ Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
{
return __lerp_CN<Tin, Tout, NC(Tin)>::_lerp_CN(a, b, d);
}
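//A small sketch of the _lerp dispatch (illustrative only, compiled out).
//_lerp blends a towards b by weight d, channel by channel: b*d + a*(1-d).
#if 0
static void lerpExample()
{
    float3 a = make_float3(0.0f, 0.0f, 0.0f);
    float3 b = make_float3(1.0f, 2.0f, 4.0f);
    //Resolves to __lerp_CN<float3, float3, 3>; m == (0.25, 0.5, 1.0)
    float3 m = _lerp<float3, float3>(a, b, 0.25f);
    (void)m;
}
#endif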
template<typename T>
__global__ void kernelDownsampleX2(T *d_src,
Ncv32u srcPitch,
T *d_dst,
Ncv32u dstPitch,
NcvSize32u dstRoi)
{
Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;
if (i < dstRoi.height && j < dstRoi.width)
{
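        //Pitch is given in bytes, hence the Ncv8u* arithmetic before casting back to T*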
T *d_src_line1 = (T *)((Ncv8u *)d_src + (2 * i + 0) * srcPitch);
T *d_src_line2 = (T *)((Ncv8u *)d_src + (2 * i + 1) * srcPitch);
T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);
T p00 = d_src_line1[2*j+0];
T p01 = d_src_line1[2*j+1];
T p10 = d_src_line2[2*j+0];
T p11 = d_src_line2[2*j+1];
d_dst_line[j] = _average4(p00, p01, p10, p11);
}
}
namespace cv { namespace gpu { namespace cudev
{
namespace pyramid
{
template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
dim3 bDim(16, 8);
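        //Note: the grid is sized from the source, so it over-covers the half-size
        //destination; the kernel's ROI guard discards the surplus threads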
dim3 gDim(divUp(src.cols, bDim.x), divUp(src.rows, bDim.y));
kernelDownsampleX2<<<gDim, bDim, 0, stream>>>((T*)src.data, static_cast<Ncv32u>(src.step),
(T*)dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void kernelDownsampleX2_gpu<uchar1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<ushort1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<float1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelDownsampleX2_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
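//A hedged host-side sketch of driving the wrapper above (compiled out; the
//function and buffer names are hypothetical). src/dst are device buffers
//described by PtrStepSzb (data pointer, step in bytes, rows, cols); dst is
//expected to be half the source size in each dimension, since the kernel
//reads one 2x2 source block per output pixel.
#if 0
static void downsampleExample(cv::gpu::PtrStepSzb d_src, cv::gpu::PtrStepSzb d_half, cudaStream_t stream)
{
    cv::gpu::cudev::pyramid::kernelDownsampleX2_gpu<uchar4>(d_src, d_half, stream);
}
#endif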
template<typename T>
__global__ void kernelInterpolateFrom1(T *d_srcTop,
Ncv32u srcTopPitch,
NcvSize32u szTopRoi,
T *d_dst,
Ncv32u dstPitch,
NcvSize32u dstRoi)
{
Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;
if (i < dstRoi.height && j < dstRoi.width)
{
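        //Map dst pixel (j,i) onto the source ROI so that the corner pixels of
        //src and dst coincide (hence the width-1/height-1 scaling); assumes
        //dstRoi dimensions are greater than 1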
Ncv32f ptTopX = 1.0f * (szTopRoi.width - 1) * j / (dstRoi.width - 1);
Ncv32f ptTopY = 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1);
Ncv32u xl = (Ncv32u)ptTopX;
Ncv32u xh = xl+1;
Ncv32f dx = ptTopX - xl;
Ncv32u yl = (Ncv32u)ptTopY;
Ncv32u yh = yl+1;
Ncv32f dy = ptTopY - yl;
T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch);
T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yh * srcTopPitch);
T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);
T p00, p01, p10, p11;
p00 = d_src_line1[xl];
p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00;
p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00;
p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00;
typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
T outPix = _pixDemoteClampZ<TVFlt, T>(mixture);
d_dst_line[j] = outPix;
}
}
namespace cv { namespace gpu { namespace cudev
{
namespace pyramid
{
template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
dim3 bDim(16, 8);
dim3 gDim(divUp(dst.cols, bDim.x), divUp(dst.rows, bDim.y));
kernelInterpolateFrom1<<<gDim, bDim, 0, stream>>>((T*) src.data, static_cast<Ncv32u>(src.step), NcvSize32u(src.cols, src.rows),
(T*) dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void kernelInterpolateFrom1_gpu<uchar1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<ushort1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<float1>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void kernelInterpolateFrom1_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
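//A matching sketch for the bilinear upsampling wrapper above (compiled out;
//names are hypothetical). Unlike the downsample path, dst may have any size:
//each output pixel is mapped back into the source ROI and interpolated.
#if 0
static void upsampleExample(cv::gpu::PtrStepSzb d_small, cv::gpu::PtrStepSzb d_big, cudaStream_t stream)
{
    cv::gpu::cudev::pyramid::kernelInterpolateFrom1_gpu<float3>(d_small, d_big, stream);
}
#endif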
#if 0 //def _WIN32
template<typename T>
static T _interpLinear(const T &a, const T &b, Ncv32f d)
{
typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
TVFlt tmp = _lerp<T, TVFlt>(a, b, d);
return _pixDemoteClampZ<TVFlt, T>(tmp);
}
template<typename T>
static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
{
Ncv32u xl = (Ncv32u)x;
Ncv32u xh = xl+1;
Ncv32f dx = x - xl;
Ncv32u yl = (Ncv32u)y;
Ncv32u yh = yl+1;
Ncv32f dy = y - yl;
T p00, p01, p10, p11;
p00 = refLayer.at(xl, yl);
p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00;
p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00;
p11 = (xh < refLayer.width() && yh < refLayer.height()) ? refLayer.at(xh, yh) : p00;
typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
return _pixDemoteClampZ<TVFlt, T>(mixture);
}
template <class T>
NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
Ncv8u numLayers,
INCVMemAllocator &alloc,
cudaStream_t cuStream)
{
this->_isInitialized = false;
ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid::ctor error", );
this->layer0 = &img;
NcvSize32u szLastLayer(img.width(), img.height());
this->nLayers = 1;
NCV_SET_SKIP_COND(alloc.isCounting());
NcvBool bDeviceCode = alloc.memType() == NCVMemoryTypeDevice;
if (numLayers == 0)
{
numLayers = 255; //the loop below stops as soon as either dimension reaches 1
}
#ifdef SELF_CHECK_GPU
NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif
for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++)
{
NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
if (szCurLayer.width == 0 || szCurLayer.height == 0)
{
break;
}
this->pyramid.push_back(new NCVMatrixAlloc<T>(alloc, szCurLayer.width, szCurLayer.height));
ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid::ctor error", );
this->nLayers++;
//fill in the layer
NCV_SKIP_COND_BEGIN
const NCVMatrix<T> *prevLayer = i == 0 ? this->layer0 : this->pyramid[i-1];
NCVMatrix<T> *curLayer = this->pyramid[i];
if (bDeviceCode)
{
dim3 bDim(16, 8);
dim3 gDim(divUp(szCurLayer.width, bDim.x), divUp(szCurLayer.height, bDim.y));
kernelDownsampleX2<<<gDim, bDim, 0, cuStream>>>(prevLayer->ptr(),
prevLayer->pitch(),
curLayer->ptr(),
curLayer->pitch(),
szCurLayer);
ncvAssertPrintReturn(cudaSuccess == cudaGetLastError(), "NCVImagePyramid::ctor error", );
#ifdef SELF_CHECK_GPU
NCVMatrixAlloc<T> h_prevLayer(allocCPU, prevLayer->width(), prevLayer->height());
ncvAssertPrintReturn(h_prevLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
NCVMatrixAlloc<T> h_curLayer(allocCPU, curLayer->width(), curLayer->height());
ncvAssertPrintReturn(h_curLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
ncvAssertPrintReturn(NCV_SUCCESS == prevLayer->copy2D(h_prevLayer, prevLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
ncvAssertPrintReturn(NCV_SUCCESS == curLayer->copy2D(h_curLayer, curLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
ncvAssertPrintReturn(cudaSuccess == cudaStreamSynchronize(cuStream), "Validation failure in NCVImagePyramid::ctor", );
for (Ncv32u i=0; i<szCurLayer.height; i++)
{
for (Ncv32u j=0; j<szCurLayer.width; j++)
{
T p00 = h_prevLayer.at(2*j+0, 2*i+0);
T p01 = h_prevLayer.at(2*j+1, 2*i+0);
T p10 = h_prevLayer.at(2*j+0, 2*i+1);
T p11 = h_prevLayer.at(2*j+1, 2*i+1);
T outGold = _average4(p00, p01, p10, p11);
T outGPU = h_curLayer.at(j, i);
ncvAssertPrintReturn(0 == memcmp(&outGold, &outGPU, sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelDownsampleX2", );
}
}
#endif
}
else
{
for (Ncv32u i=0; i<szCurLayer.height; i++)
{
for (Ncv32u j=0; j<szCurLayer.width; j++)
{
T p00 = prevLayer->at(2*j+0, 2*i+0);
T p01 = prevLayer->at(2*j+1, 2*i+0);
T p10 = prevLayer->at(2*j+0, 2*i+1);
T p11 = prevLayer->at(2*j+1, 2*i+1);
curLayer->at(j, i) = _average4(p00, p01, p10, p11);
}
}
}
NCV_SKIP_COND_END
szLastLayer = szCurLayer;
}
this->_isInitialized = true;
}
template <class T>
NCVImagePyramid<T>::~NCVImagePyramid()
{
}
template <class T>
NcvBool NCVImagePyramid<T>::isInitialized() const
{
return this->_isInitialized;
}
template <class T>
NCVStatus NCVImagePyramid<T>::getLayer(NCVMatrix<T> &outImg,
NcvSize32u outRoi,
NcvBool bTrilinear,
cudaStream_t cuStream) const
{
ncvAssertReturn(this->isInitialized(), NCV_UNKNOWN_ERROR);
ncvAssertReturn(outImg.memType() == this->layer0->memType(), NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(outRoi.width <= this->layer0->width() && outRoi.height <= this->layer0->height() &&
outRoi.width > 0 && outRoi.height > 0, NCV_DIMENSIONS_INVALID);
if (outRoi.width == this->layer0->width() && outRoi.height == this->layer0->height())
{
ncvAssertReturnNcvStat(this->layer0->copy2D(outImg, NcvSize32u(this->layer0->width(), this->layer0->height()), cuStream));
return NCV_SUCCESS;
}
Ncv32f lastScale = 1.0f;
Ncv32f curScale;
const NCVMatrix<T> *lastLayer = this->layer0;
const NCVMatrix<T> *curLayer = NULL;
NcvBool bUse2Refs = false;
for (Ncv32u i=0; i<this->nLayers-1; i++)
{
curScale = lastScale * 0.5f;
curLayer = this->pyramid[i];
if (outRoi.width == curLayer->width() && outRoi.height == curLayer->height())
{
ncvAssertReturnNcvStat(this->pyramid[i]->copy2D(outImg, NcvSize32u(this->pyramid[i]->width(), this->pyramid[i]->height()), cuStream));
return NCV_SUCCESS;
}
if (outRoi.width >= curLayer->width() && outRoi.height >= curLayer->height())
{
if (outRoi.width < lastLayer->width() && outRoi.height < lastLayer->height())
{
bUse2Refs = true;
}
break;
}
lastScale = curScale;
lastLayer = curLayer;
}
bUse2Refs = bUse2Refs && bTrilinear;
NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone);
NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice;
#ifdef SELF_CHECK_GPU
NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif
NCV_SKIP_COND_BEGIN
if (bDeviceCode)
{
ncvAssertReturn(bUse2Refs == false, NCV_NOT_IMPLEMENTED);
dim3 bDim(16, 8);
dim3 gDim(divUp(outRoi.width, bDim.x), divUp(outRoi.height, bDim.y));
kernelInterpolateFrom1<<<gDim, bDim, 0, cuStream>>>(lastLayer->ptr(),
lastLayer->pitch(),
lastLayer->size(),
outImg.ptr(),
outImg.pitch(),
outRoi);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
#ifdef SELF_CHECK_GPU
ncvSafeMatAlloc(h_lastLayer, T, allocCPU, lastLayer->width(), lastLayer->height(), NCV_ALLOCATOR_BAD_ALLOC);
ncvSafeMatAlloc(h_outImg, T, allocCPU, outImg.width(), outImg.height(), NCV_ALLOCATOR_BAD_ALLOC);
ncvAssertReturnNcvStat(lastLayer->copy2D(h_lastLayer, lastLayer->size(), cuStream));
ncvAssertReturnNcvStat(outImg.copy2D(h_outImg, outRoi, cuStream));
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
for (Ncv32u i=0; i<outRoi.height; i++)
{
for (Ncv32u j=0; j<outRoi.width; j++)
{
NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
T outGold = _interpBilinear(h_lastLayer, ptTopX, ptTopY);
ncvAssertPrintReturn(0 == memcmp(&outGold, &h_outImg.at(j,i), sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelInterpolateFrom1", NCV_UNKNOWN_ERROR);
}
}
#endif
}
else
{
for (Ncv32u i=0; i<outRoi.height; i++)
{
for (Ncv32u j=0; j<outRoi.width; j++)
{
//top layer pixel (always exists)
NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
T topPix = _interpBilinear(*lastLayer, ptTopX, ptTopY);
T trilinearPix = topPix;
if (bUse2Refs)
{
//bottom layer pixel (exists only if the requested scale is greater than the smallest layer scale)
NcvSize32u szBottomLayer(curLayer->width(), curLayer->height());
Ncv32f ptBottomX = 1.0f * (szBottomLayer.width - 1) * j / (outRoi.width - 1);
Ncv32f ptBottomY = 1.0f * (szBottomLayer.height - 1) * i / (outRoi.height - 1);
T bottomPix = _interpBilinear(*curLayer, ptBottomX, ptBottomY);
Ncv32f scale = (1.0f * outRoi.width / layer0->width() + 1.0f * outRoi.height / layer0->height()) / 2;
Ncv32f dl = (scale - curScale) / (lastScale - curScale);
dl = CLAMP(dl, 0.0f, 1.0f);
trilinearPix = _interpLinear(bottomPix, topPix, dl);
}
outImg.at(j, i) = trilinearPix;
}
}
}
NCV_SKIP_COND_END
return NCV_SUCCESS;
}
template class NCVImagePyramid<uchar1>;
template class NCVImagePyramid<uchar3>;
template class NCVImagePyramid<uchar4>;
template class NCVImagePyramid<ushort1>;
template class NCVImagePyramid<ushort3>;
template class NCVImagePyramid<ushort4>;
template class NCVImagePyramid<uint1>;
template class NCVImagePyramid<uint3>;
template class NCVImagePyramid<uint4>;
template class NCVImagePyramid<float1>;
template class NCVImagePyramid<float3>;
template class NCVImagePyramid<float4>;
#endif //_WIN32
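//A short sketch of the trilinear weight used in getLayer() above (compiled
//out together with it). For a requested scale between two adjacent pyramid
//layers, the blend weight dl selects the finer layer at dl == 1 and the
//coarser one at dl == 0; the values below are made up.
#if 0
static float trilinearWeightExample()
{
    float lastScale = 0.5f;     //finer of the two candidate layers
    float curScale  = 0.25f;    //coarser layer, one level down
    float scale     = 0.375f;   //requested output scale, halfway between
    float dl = (scale - curScale) / (lastScale - curScale);  //== 0.5f
    return CLAMP(dl, 0.0f, 1.0f);
}
#endif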

View File

@@ -0,0 +1,221 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _ncvruntimetemplates_hpp_
#define _ncvruntimetemplates_hpp_
#if defined _MSC_VER && _MSC_VER >= 1200
#pragma warning( disable: 4800 )
#endif
#include <stdarg.h>
#include <vector>
////////////////////////////////////////////////////////////////////////////////
// The Loki Library
// Copyright (c) 2001 by Andrei Alexandrescu
// This code accompanies the book:
// Alexandrescu, Andrei. "Modern C++ Design: Generic Programming and Design
// Patterns Applied". Copyright (c) 2001. Addison-Wesley.
// Permission to use, copy, modify, distribute and sell this software for any
// purpose is hereby granted without fee, provided that the above copyright
// notice appear in all copies and that both that copyright notice and this
// permission notice appear in supporting documentation.
// The author or Addison-Wesley Longman make no representations about the
// suitability of this software for any purpose. It is provided "as is"
// without express or implied warranty.
// http://loki-lib.sourceforge.net/index.php?n=Main.License
////////////////////////////////////////////////////////////////////////////////
namespace Loki
{
//==============================================================================
// class NullType
// Used as a placeholder for "no type here"
// Useful as an end marker in typelists
//==============================================================================
class NullType {};
//==============================================================================
// class template Typelist
// The building block of typelists of any length
// Use it through the LOKI_TYPELIST_NN macros
// Defines nested types:
// Head (first element, a non-typelist type by convention)
// Tail (second element, can be another typelist)
//==============================================================================
template <class T, class U>
struct Typelist
{
typedef T Head;
typedef U Tail;
};
//==============================================================================
// class template Int2Type
// Converts each integral constant into a unique type
// Invocation: Int2Type<v> where v is a compile-time constant integral
// Defines 'value', an enum that evaluates to v
//==============================================================================
template <int v>
struct Int2Type
{
enum { value = v };
};
namespace TL
{
//==============================================================================
// class template TypeAt
// Finds the type at a given index in a typelist
// Invocation (TList is a typelist and index is a compile-time integral
// constant):
// TypeAt<TList, index>::Result
// returns the type in position 'index' in TList
// If you pass an out-of-bounds index, the result is a compile-time error
//==============================================================================
template <class TList, unsigned int index> struct TypeAt;
template <class Head, class Tail>
struct TypeAt<Typelist<Head, Tail>, 0>
{
typedef Head Result;
};
template <class Head, class Tail, unsigned int i>
struct TypeAt<Typelist<Head, Tail>, i>
{
typedef typename TypeAt<Tail, i - 1>::Result Result;
};
}
}
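//A compile-time usage sketch of the Loki fragments above (illustrative only,
//compiled out).
#if 0
typedef Loki::Typelist<int, Loki::Typelist<float, Loki::NullType> > TL_Example;
//TypeAt recurses: index 0 yields Head, otherwise Tail is searched at index-1
typedef Loki::TL::TypeAt<TL_Example, 0>::Result T0;   //int
typedef Loki::TL::TypeAt<TL_Example, 1>::Result T1;   //float
//Int2Type gives every integral constant its own distinct type
typedef Loki::Int2Type<1> TrueFlag;                   //TrueFlag::value == 1
#endif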
////////////////////////////////////////////////////////////////////////////////
// Runtime boolean template instance dispatcher
// Cyril Crassin <cyril.crassin@icare3d.org>
// NVIDIA, 2010
////////////////////////////////////////////////////////////////////////////////
namespace NCVRuntimeTemplateBool
{
//This struct is used to transform a list of parameters into template arguments
//The idea is to build a typelist containing the arguments
//and to pass this typelist to a user-defined functor
template<typename TList, int NumArguments, class Func>
struct KernelCaller
{
//Convenience entry point used by the caller:
//collects a variable argument list into a vector of ints
static void call(Func *functor, ...)
{
//Vector used to collect arguments
std::vector<int> templateParamList;
//Variable argument list manipulation
va_list listPointer;
va_start(listPointer, functor);
//Collect parameters into the list
for(int i=0; i<NumArguments; i++)
{
int val = va_arg(listPointer, int);
templateParamList.push_back(val);
}
va_end(listPointer);
//Call the actual typelist building function
call(*functor, templateParamList);
}
//Actual function called recursively to build a typelist based
//on a list of values
static void call( Func &functor, std::vector<int> &templateParamList)
{
//Get current parameter value in the list
NcvBool val = templateParamList[templateParamList.size() - 1];
templateParamList.pop_back();
//Select the compile time value to add into the typelist
//depending on the runtime variable and make recursive call.
//Both versions are really instantiated
if (val)
{
KernelCaller<
                Loki::Typelist<Loki::Int2Type<1>, TList >,
NumArguments-1, Func >
::call(functor, templateParamList);
}
else
{
KernelCaller<
                Loki::Typelist<Loki::Int2Type<0>, TList >,
NumArguments-1, Func >
::call(functor, templateParamList);
}
}
};
//Specialization for 0 value left in the list
//-> actual kernel functor call
template<class TList, class Func>
struct KernelCaller<TList, 0, Func>
{
static void call(Func &functor)
{
//Call to the functor's kernel call method
functor.call(TList()); //TList instantiated to get the method template parameter resolved
}
static void call(Func &functor, std::vector<int> &templateParams)
{
(void)templateParams;
functor.call(TList());
}
};
}
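//A hedged sketch of how a functor plugs into KernelCaller (compiled out;
//the functor and flag names are hypothetical). KernelCaller peels the
//runtime ints off the back of the vector and prepends one Int2Type per
//flag, so the first argument ends up at index 0 of the final typelist.
#if 0
struct ExampleFunctor
{
    template <class TList>
    void call(TList)
    {
        //Recover the compile-time flags from the typelist
        enum { flagA = Loki::TL::TypeAt<TList, 0>::Result::value };
        enum { flagB = Loki::TL::TypeAt<TList, 1>::Result::value };
        //...select a kernel instantiation specialized on flagA/flagB...
    }
};
static void exampleDispatch(bool a, bool b)
{
    ExampleFunctor functor;
    //Two runtime booleans become two compile-time template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, ExampleFunctor>
        ::call(&functor, (int)a, (int)b);
}
#endif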
#endif //_ncvruntimetemplates_hpp_

File diff suppressed because it is too large

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -0,0 +1,62 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include <limits>
#include <iostream>
#include <algorithm>
#include "opencv2/gpulegacy.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_OBJDETECT
# include "opencv2/objdetect.hpp"
#endif
#include "opencv2/core/gpu_private.hpp"
#include "opencv2/gpulegacy/private.hpp"
#endif /* __OPENCV_PRECOMP_H__ */