Merge pull request #1575 from jet47:gpu-buffer-pool

2013-10-21 16:35:14 +04:00
parent 21233656bd 342e007dc6
commit 29f37fc130
9 changed files with 816 additions and 40 deletions
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -61,16 +61,30 @@ namespace cv { namespace cuda {
 class CV_EXPORTS GpuMat
 {
 public:
    class CV_EXPORTS Allocator
    {
    public:
        virtual ~Allocator() {}
        // allocator must fill data, step and refcount fields
        virtual bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize) = 0;
        virtual void free(GpuMat* mat) = 0;
    };
    //! default allocator
    static Allocator* defaultAllocator();
    static void setDefaultAllocator(Allocator* allocator);
    //! default constructor
-    GpuMat();
+    explicit GpuMat(Allocator* allocator = defaultAllocator());
    //! constructs GpuMat of the specified size and type
-    GpuMat(int rows, int cols, int type);
+    GpuMat(int rows, int cols, int type, Allocator* allocator = defaultAllocator());
-    GpuMat(Size size, int type);
+    GpuMat(Size size, int type, Allocator* allocator = defaultAllocator());
    //! constucts GpuMat and fills it with the specified value _s
-    GpuMat(int rows, int cols, int type, Scalar s);
+    GpuMat(int rows, int cols, int type, Scalar s, Allocator* allocator = defaultAllocator());
-    GpuMat(Size size, int type, Scalar s);
+    GpuMat(Size size, int type, Scalar s, Allocator* allocator = defaultAllocator());
    //! copy constructor
    GpuMat(const GpuMat& m);
@@ -84,7 +98,7 @@ public:
    GpuMat(const GpuMat& m, Rect roi);
    //! builds GpuMat from host memory (Blocking call)
-    explicit GpuMat(InputArray arr);
+    explicit GpuMat(InputArray arr, Allocator* allocator = defaultAllocator());
    //! destructor - calls release()
    ~GpuMat();
@@ -249,6 +263,9 @@ public:
    //! helper fields used in locateROI and adjustROI
    uchar* datastart;
    uchar* dataend;
    //! allocator
    Allocator* allocator;
 };
 //! creates continuous matrix
@@ -260,6 +277,10 @@ CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, OutputArray arr
 CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat);
 //! BufferPool management (must be called before Stream creation)
 CV_EXPORTS void setBufferPoolUsage(bool on);
 CV_EXPORTS void setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount);
 //////////////////////////////// CudaMem ////////////////////////////////
 // CudaMem is limited cv::Mat with page locked memory allocation.
@@ -382,6 +403,7 @@ private:
    Stream(const Ptr<Impl>& impl);
    friend struct StreamAccessor;
    friend class BufferPool;
 };
 class CV_EXPORTS Event
--- a/modules/core/include/opencv2/core/cuda.inl.hpp
+++ b/modules/core/include/opencv2/core/cuda.inl.hpp
@@ -51,29 +51,29 @@ namespace cv { namespace cuda {
 //////////////////////////////// GpuMat ///////////////////////////////
 inline
-GpuMat::GpuMat()
+GpuMat::GpuMat(Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
 {}
 inline
-GpuMat::GpuMat(int rows_, int cols_, int type_)
+GpuMat::GpuMat(int rows_, int cols_, int type_, Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
 {
    if (rows_ > 0 && cols_ > 0)
        create(rows_, cols_, type_);
 }
 inline
-GpuMat::GpuMat(Size size_, int type_)
+GpuMat::GpuMat(Size size_, int type_, Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
 {
    if (size_.height > 0 && size_.width > 0)
        create(size_.height, size_.width, type_);
 }
 inline
-GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)
+GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_, Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
 {
    if (rows_ > 0 && cols_ > 0)
    {
@@ -83,8 +83,8 @@ GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)
 }
 inline
-GpuMat::GpuMat(Size size_, int type_, Scalar s_)
+GpuMat::GpuMat(Size size_, int type_, Scalar s_, Allocator* allocator_)
-    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
 {
    if (size_.height > 0 && size_.width > 0)
    {
@@ -95,15 +95,15 @@ GpuMat::GpuMat(Size size_, int type_, Scalar s_)
 inline
 GpuMat::GpuMat(const GpuMat& m)
-    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), allocator(m.allocator)
 {
    if (refcount)
        CV_XADD(refcount, 1);
 }
 inline
-GpuMat::GpuMat(InputArray arr) :
+GpuMat::GpuMat(InputArray arr, Allocator* allocator_) :
-    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
+    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), allocator(allocator_)
 {
    upload(arr);
 }
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -90,6 +90,38 @@ static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The
 namespace cv { namespace cuda
 {
    class MemoryStack;
    class CV_EXPORTS StackAllocator : public GpuMat::Allocator
    {
    public:
        explicit StackAllocator(cudaStream_t stream);
        ~StackAllocator();
        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
        void free(GpuMat* mat);
    private:
        StackAllocator(const StackAllocator&);
        StackAllocator& operator =(const StackAllocator&);
        cudaStream_t stream_;
        MemoryStack* memStack_;
        size_t alignment_;
    };
    class CV_EXPORTS BufferPool
    {
    public:
        explicit BufferPool(Stream& stream);
        GpuMat getBuffer(int rows, int cols, int type);
        GpuMat getBuffer(Size size, int type) { return getBuffer(size.height, size.width, type); }
    private:
        GpuMat::Allocator* allocator_;
    };
    static inline void checkNppError(int code, const char* file, const int line, const char* func)
    {
        if (code < 0)
--- a/modules/core/src/cuda/gpu_mat.cu
+++ b/modules/core/src/cuda/gpu_mat.cu
@@ -55,6 +55,54 @@ using namespace cv;
 using namespace cv::cuda;
 using namespace cv::cudev;
 namespace
 {
    class DefaultAllocator : public GpuMat::Allocator
    {
    public:
        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
        void free(GpuMat* mat);
    };
    bool DefaultAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
    {
        if (rows > 1 && cols > 1)
        {
            CV_CUDEV_SAFE_CALL( cudaMallocPitch(&mat->data, &mat->step, elemSize * cols, rows) );
        }
        else
        {
            // Single row or single column must be continuous
            CV_CUDEV_SAFE_CALL( cudaMalloc(&mat->data, elemSize * cols * rows) );
            mat->step = elemSize * cols;
        }
        mat->refcount = (int*) fastMalloc(sizeof(int));
        return true;
    }
    void DefaultAllocator::free(GpuMat* mat)
    {
        cudaFree(mat->datastart);
        fastFree(mat->refcount);
    }
    DefaultAllocator cudaDefaultAllocator;
    GpuMat::Allocator* g_defaultAllocator = &cudaDefaultAllocator;
 }
 GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()
 {
    return g_defaultAllocator;
 }
 void cv::cuda::GpuMat::setDefaultAllocator(Allocator* allocator)
 {
    CV_Assert( allocator != 0 );
    g_defaultAllocator = allocator;
 }
 /////////////////////////////////////////////////////
 /// create
@@ -76,19 +124,16 @@ void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
        rows = _rows;
        cols = _cols;
-        size_t esz = elemSize();
+        const size_t esz = elemSize();
-        void* devPtr;
+        bool allocSuccess = allocator->allocate(this, rows, cols, esz);
-        if (rows > 1 && cols > 1)
+        if (!allocSuccess)
        {
-            CV_CUDEV_SAFE_CALL( cudaMallocPitch(&devPtr, &step, esz * cols, rows) );
+            // custom allocator fails, try default allocator
-        }
+            allocator = defaultAllocator();
-        else
+            allocSuccess = allocator->allocate(this, rows, cols, esz);
-        {
+            CV_Assert( allocSuccess );
            // Single row or single column must be continuous
            CV_CUDEV_SAFE_CALL( cudaMalloc(&devPtr, esz * cols * rows) );
            step = esz * cols;
        }
        if (esz * cols == step)
@@ -97,11 +142,11 @@ void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
        int64 _nettosize = static_cast<int64>(step) * rows;
        size_t nettosize = static_cast<size_t>(_nettosize);
-        datastart = data = static_cast<uchar*>(devPtr);
+        datastart = data;
        dataend = data + nettosize;
-        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
+        if (refcount)
-        *refcount = 1;
+            *refcount = 1;
    }
 }
@@ -110,11 +155,10 @@ void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
 void cv::cuda::GpuMat::release()
 {
    CV_DbgAssert( allocator != 0 );
    if (refcount && CV_XADD(refcount, -1) == 1)
-    {
+        allocator->free(this);
        cudaFree(datastart);
        fastFree(refcount);
    }
    data = datastart = dataend = 0;
    step = rows = cols = 0;
--- a/modules/core/src/cuda_buffer_pool.cpp
+++ b/modules/core/src/cuda_buffer_pool.cpp
@@ -0,0 +1,418 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                          License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "precomp.hpp"
 using namespace cv;
 using namespace cv::cuda;
 #ifdef HAVE_CUDA
 #include "opencv2/cudev/common.hpp"
 /////////////////////////////////////////////////////////////
 /// MemoryStack
 namespace
 {
    class MemoryPool;
 }
 class cv::cuda::MemoryStack
 {
 public:
    uchar* requestMemory(size_t size);
    void returnMemory(uchar* ptr);
    uchar* datastart;
    uchar* dataend;
    uchar* tip;
    bool isFree;
    MemoryPool* pool;
 #if defined(DEBUG) || defined(_DEBUG)
    std::vector<size_t> allocations;
 #endif
 };
 uchar* cv::cuda::MemoryStack::requestMemory(size_t size)
 {
    const size_t freeMem = dataend - tip;
    if (size > freeMem)
        return 0;
    uchar* ptr = tip;
    tip += size;
 #if defined(DEBUG) || defined(_DEBUG)
    allocations.push_back(size);
 #endif
    return ptr;
 }
 void cv::cuda::MemoryStack::returnMemory(uchar* ptr)
 {
    CV_DbgAssert( ptr >= datastart && ptr < dataend );
 #if defined(DEBUG) || defined(_DEBUG)
    const size_t allocSize = tip - ptr;
    CV_Assert( allocSize == allocations.back() );
    allocations.pop_back();
 #endif
    tip = ptr;
 }
 /////////////////////////////////////////////////////////////
 /// MemoryPool
 namespace
 {
    class MemoryPool
    {
    public:
        MemoryPool();
        void initialize(size_t stackSize, int stackCount);
        void release();
        MemoryStack* getFreeMemStack();
        void returnMemStack(MemoryStack* memStack);
    private:
        void initilizeImpl();
        Mutex mtx_;
        bool initialized_;
        size_t stackSize_;
        int stackCount_;
        uchar* mem_;
        std::vector<MemoryStack> stacks_;
    };
    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
    {
        // default : 10 Mb, 5 stacks
        stackSize_ = 10 * 1024 * 1024;
        stackCount_ = 5;
    }
    void MemoryPool::initialize(size_t stackSize, int stackCount)
    {
        AutoLock lock(mtx_);
        release();
        stackSize_ = stackSize;
        stackCount_ = stackCount;
        initilizeImpl();
    }
    void MemoryPool::initilizeImpl()
    {
        const size_t totalSize = stackSize_ * stackCount_;
        if (totalSize > 0)
        {
            cudaError_t err = cudaMalloc(&mem_, totalSize);
            if (err != cudaSuccess)
                return;
            stacks_.resize(stackCount_);
            uchar* ptr = mem_;
            for (int i = 0; i < stackCount_; ++i)
            {
                stacks_[i].datastart = ptr;
                stacks_[i].dataend = ptr + stackSize_;
                stacks_[i].tip = ptr;
                stacks_[i].isFree = true;
                stacks_[i].pool = this;
                ptr += stackSize_;
            }
            initialized_ = true;
        }
    }
    void MemoryPool::release()
    {
        if (mem_)
        {
 #if defined(DEBUG) || defined(_DEBUG)
            for (int i = 0; i < stackCount_; ++i)
            {
                CV_DbgAssert( stacks_[i].isFree );
                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
            }
 #endif
            cudaFree( mem_ );
            mem_ = 0;
            initialized_ = false;
        }
    }
    MemoryStack* MemoryPool::getFreeMemStack()
    {
        AutoLock lock(mtx_);
        if (!initialized_)
            initilizeImpl();
        if (!mem_)
            return 0;
        for (int i = 0; i < stackCount_; ++i)
        {
            if (stacks_[i].isFree)
            {
                stacks_[i].isFree = false;
                return &stacks_[i];
            }
        }
        return 0;
    }
    void MemoryPool::returnMemStack(MemoryStack* memStack)
    {
        AutoLock lock(mtx_);
        CV_DbgAssert( !memStack->isFree );
 #if defined(DEBUG) || defined(_DEBUG)
        bool found = false;
        for (int i = 0; i < stackCount_; ++i)
        {
            if (memStack == &stacks_[i])
            {
                found = true;
                break;
            }
        }
        CV_DbgAssert( found );
 #endif
        CV_DbgAssert( memStack->tip == memStack->datastart );
        memStack->isFree = true;
    }
 }
 /////////////////////////////////////////////////////////////
 /// MemoryPoolManager
 namespace
 {
    class MemoryPoolManager
    {
    public:
        MemoryPoolManager();
        ~MemoryPoolManager();
        MemoryPool* getPool(int deviceId);
    private:
        std::vector<MemoryPool> pools_;
    };
    MemoryPoolManager::MemoryPoolManager()
    {
        int deviceCount = getCudaEnabledDeviceCount();
        if (deviceCount > 0)
            pools_.resize(deviceCount);
    }
    MemoryPoolManager::~MemoryPoolManager()
    {
        for (size_t i = 0; i < pools_.size(); ++i)
        {
            cudaSetDevice(i);
            pools_[i].release();
        }
    }
    MemoryPool* MemoryPoolManager::getPool(int deviceId)
    {
        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
        return &pools_[deviceId];
    }
    MemoryPool* memPool(int deviceId)
    {
        static MemoryPoolManager manager;
        return manager.getPool(deviceId);
    }
 }
 /////////////////////////////////////////////////////////////
 /// StackAllocator
 namespace
 {
    bool enableMemoryPool = true;
 }
 cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
 {
    if (enableMemoryPool)
    {
        const int deviceId = getDevice();
        memStack_ = memPool(deviceId)->getFreeMemStack();
        DeviceInfo devInfo(deviceId);
        alignment_ = devInfo.textureAlignment();
    }
 }
 cv::cuda::StackAllocator::~StackAllocator()
 {
    cudaStreamSynchronize(stream_);
    if (memStack_ != 0)
        memStack_->pool->returnMemStack(memStack_);
 }
 namespace
 {
    size_t alignUp(size_t what, size_t alignment)
    {
        size_t alignMask = alignment-1;
        size_t inverseAlignMask = ~alignMask;
        size_t res = (what + alignMask) & inverseAlignMask;
        return res;
    }
 }
 bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
 {
    if (memStack_ == 0)
        return false;
    size_t pitch, memSize;
    if (rows > 1 && cols > 1)
    {
        pitch = alignUp(cols * elemSize, alignment_);
        memSize = pitch * rows;
    }
    else
    {
        // Single row or single column must be continuous
        pitch = elemSize * cols;
        memSize = alignUp(elemSize * cols * rows, 64);
    }
    uchar* ptr = memStack_->requestMemory(memSize);
    if (ptr == 0)
        return false;
    mat->data = ptr;
    mat->step = pitch;
    mat->refcount = (int*) fastMalloc(sizeof(int));
    return true;
 }
 void cv::cuda::StackAllocator::free(GpuMat* mat)
 {
    if (memStack_ == 0)
        return;
    memStack_->returnMemory(mat->datastart);
    fastFree(mat->refcount);
 }
 void cv::cuda::setBufferPoolUsage(bool on)
 {
    enableMemoryPool = on;
 }
 void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
 {
    const int currentDevice = getDevice();
    if (deviceId >= 0)
    {
        setDevice(deviceId);
        memPool(deviceId)->initialize(stackSize, stackCount);
    }
    else
    {
        const int deviceCount = getCudaEnabledDeviceCount();
        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
        {
            setDevice(deviceId);
            memPool(deviceId)->initialize(stackSize, stackCount);
        }
    }
    setDevice(currentDevice);
 }
 /////////////////////////////////////////////////////////////
 /// BufferPool
 GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
 {
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
 }
 #endif
--- a/modules/core/src/cuda_gpu_mat.cpp
+++ b/modules/core/src/cuda_gpu_mat.cpp
@@ -49,7 +49,8 @@ using namespace cv::cuda;
 cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
    flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_),
    step(step_), data((uchar*)data_), refcount(0),
-    datastart((uchar*)data_), dataend((uchar*)data_)
+    datastart((uchar*)data_), dataend((uchar*)data_),
    allocator(defaultAllocator())
 {
    size_t minstep = cols * elemSize();
@@ -74,7 +75,8 @@ cv::cuda::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t st
 cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
    flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(size_.height), cols(size_.width),
    step(step_), data((uchar*)data_), refcount(0),
-    datastart((uchar*)data_), dataend((uchar*)data_)
+    datastart((uchar*)data_), dataend((uchar*)data_),
    allocator(defaultAllocator())
 {
    size_t minstep = cols * elemSize();
@@ -92,6 +94,7 @@ cv::cuda::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
    }
    dataend += step * (rows - 1) + minstep;
 }
@@ -100,6 +103,7 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
    flags = m.flags;
    step = m.step; refcount = m.refcount;
    data = m.data; datastart = m.datastart; dataend = m.dataend;
    allocator = m.allocator;
    if (rowRange_ == Range::all())
    {
@@ -139,7 +143,8 @@ cv::cuda::GpuMat::GpuMat(const GpuMat& m, Range rowRange_, Range colRange_)
 cv::cuda::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
    flags(m.flags), rows(roi.height), cols(roi.width),
    step(m.step), data(m.data + roi.y*step), refcount(m.refcount),
-    datastart(m.datastart), dataend(m.dataend)
+    datastart(m.datastart), dataend(m.dataend),
    allocator(m.allocator)
 {
    flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
    data += roi.x * elemSize();
@@ -347,6 +352,17 @@ GpuMat cv::cuda::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
 #ifndef HAVE_CUDA
 GpuMat::Allocator* cv::cuda::GpuMat::defaultAllocator()
 {
    return 0;
 }
 void cv::cuda::GpuMat::setDefaultAllocator(Allocator* allocator)
 {
    (void) allocator;
    throw_no_cuda();
 }
 void cv::cuda::GpuMat::create(int _rows, int _cols, int _type)
 {
    (void) _rows;
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@@ -66,6 +66,7 @@ class cv::cuda::Stream::Impl
 {
 public:
    cudaStream_t stream;
    Ptr<StackAllocator> stackAllocator_;
    Impl();
    Impl(cudaStream_t stream);
@@ -73,17 +74,26 @@ public:
    ~Impl();
 };
 cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
 {
 }
 cv::cuda::Stream::Impl::Impl() : stream(0)
 {
    cudaSafeCall( cudaStreamCreate(&stream) );
    stackAllocator_ = makePtr<StackAllocator>(stream);
 }
 cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
 {
    stackAllocator_ = makePtr<StackAllocator>(stream);
 }
 cv::cuda::Stream::Impl::~Impl()
 {
    stackAllocator_.release();
    if (stream)
        cudaStreamDestroy(stream);
 }
@@ -197,7 +207,7 @@ cv::cuda::Stream::operator bool_type() const
 ////////////////////////////////////////////////////////////////
-// Stream
+// Event
 #ifndef HAVE_CUDA
--- a/modules/cuda/perf/perf_buffer_pool.cpp
+++ b/modules/cuda/perf/perf_buffer_pool.cpp
@@ -0,0 +1,114 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "perf_precomp.hpp"
 #ifdef HAVE_CUDA
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/core/private.cuda.hpp"
 using namespace testing;
 using namespace perf;
 using namespace cv;
 using namespace cv::cuda;
 namespace
 {
    void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
    {
        BufferPool pool(stream);
        GpuMat buf = pool.getBuffer(src.size(), CV_32FC(src.channels()));
        src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
        cuda::exp(buf, dst, stream);
    }
    void func2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
    {
        BufferPool pool(stream);
        GpuMat buf1 = pool.getBuffer(src1.size(), CV_32FC(src1.channels()));
        func1(src1, buf1, stream);
        GpuMat buf2 = pool.getBuffer(src2.size(), CV_32FC(src2.channels()));
        func1(src2, buf2, stream);
        cuda::add(buf1, buf2, dst, noArray(), -1, stream);
    }
 }
 PERF_TEST_P(Sz, BufferPool, CUDA_TYPICAL_MAT_SIZES)
 {
    static bool first = true;
    const Size size = GetParam();
    const bool useBufferPool = PERF_RUN_CUDA();
    Mat host_src(size, CV_8UC1);
    declare.in(host_src, WARMUP_RNG);
    GpuMat src1(host_src), src2(host_src);
    GpuMat dst;
    setBufferPoolUsage(useBufferPool);
    if (useBufferPool && first)
    {
        setBufferPoolConfig(-1, 25 * 1024 * 1024, 2);
        first = false;
    }
    TEST_CYCLE()
    {
        func2(src1, src2, dst, Stream::Null());
    }
    Mat h_dst(dst);
    SANITY_CHECK(h_dst);
 }
 #endif
--- a/modules/cuda/test/test_buffer_pool.cpp
+++ b/modules/cuda/test/test_buffer_pool.cpp
@@ -0,0 +1,120 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #include "test_precomp.hpp"
 #ifdef HAVE_CUDA
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/cudawarping.hpp"
 #include "opencv2/core/private.cuda.hpp"
 using namespace testing;
 using namespace cv;
 using namespace cv::cuda;
 struct BufferPoolTest : TestWithParam<DeviceInfo>
 {
 };
 namespace
 {
    void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
    {
        BufferPool pool(stream);
        GpuMat buf = pool.getBuffer(src.size(), CV_32FC(src.channels()));
        src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
        cuda::exp(buf, dst, stream);
    }
    void func2(const GpuMat& src, GpuMat& dst, Stream& stream)
    {
        BufferPool pool(stream);
        GpuMat buf1 = pool.getBuffer(saturate_cast<int>(src.rows * 0.5), saturate_cast<int>(src.cols * 0.5), src.type());
        cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST, stream);
        GpuMat buf2 = pool.getBuffer(buf1.size(), CV_32FC(buf1.channels()));
        func1(buf1, buf2, stream);
        GpuMat buf3 = pool.getBuffer(src.size(), buf2.type());
        cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST, stream);
        buf3.convertTo(dst, CV_8U, stream);
    }
 }
 CUDA_TEST_P(BufferPoolTest, SimpleUsage)
 {
    DeviceInfo devInfo = GetParam();
    setDevice(devInfo.deviceID());
    GpuMat src(200, 200, CV_8UC1);
    GpuMat dst;
    Stream stream;
    func2(src, dst, stream);
    stream.waitForCompletion();
    GpuMat buf, buf1, buf2, buf3;
    GpuMat dst_gold;
    cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST);
    buf1.convertTo(buf, CV_32F, 1.0 / 255.0);
    cuda::exp(buf, buf2);
    cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST);
    buf3.convertTo(dst_gold, CV_8U);
    ASSERT_MAT_NEAR(dst_gold, dst, 0);
 }
 INSTANTIATE_TEST_CASE_P(CUDA_Stream, BufferPoolTest, ALL_DEVICES);
 #endif // HAVE_CUDA