From 7ed38b97c3726c14155865267b054d2ad6049f41 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Wed, 17 Dec 2014 18:51:15 +0300
Subject: [PATCH] fix cuda::BufferPool deinitialization

The deinitialization of BufferPool internal objects is controled by global
object, but it depends on other global objects, which leads to errors
caused by undefined deinitialization order of global objects.

I merge global objects initialization into single class, which performs
initialization and deinitialization in correct order.
---
 modules/core/include/opencv2/core/cuda.hpp |   1 +
 modules/core/src/cuda_buffer_pool.cpp      | 435 ------------------
 modules/core/src/cuda_stream.cpp           | 488 ++++++++++++++++++++-
 3 files changed, 466 insertions(+), 458 deletions(-)
 delete mode 100644 modules/core/src/cuda_buffer_pool.cpp

diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp
index 15d526e80..8e0944061 100644
--- a/modules/core/include/opencv2/core/cuda.hpp
+++ b/modules/core/include/opencv2/core/cuda.hpp
@@ -479,6 +479,7 @@ private:
 
     friend struct StreamAccessor;
     friend class BufferPool;
+    friend class DefaultDeviceInitializer;
 };
 
 class CV_EXPORTS Event
diff --git a/modules/core/src/cuda_buffer_pool.cpp b/modules/core/src/cuda_buffer_pool.cpp
deleted file mode 100644
index e5caf6ef2..000000000
--- a/modules/core/src/cuda_buffer_pool.cpp
+++ /dev/null
@@ -1,435 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::cuda;
-
-#ifdef HAVE_CUDA
-
-#include "opencv2/cudev/common.hpp"
-
-/////////////////////////////////////////////////////////////
-/// MemoryStack
-
-namespace
-{
-    class MemoryPool;
-}
-
-class cv::cuda::MemoryStack
-{
-public:
-    uchar* requestMemory(size_t size);
-    void returnMemory(uchar* ptr);
-
-    uchar* datastart;
-    uchar* dataend;
-    uchar* tip;
-
-    bool isFree;
-    MemoryPool* pool;
-
-#if defined(DEBUG) || defined(_DEBUG)
-    std::vector<size_t> allocations;
-#endif
-};
-
-uchar* cv::cuda::MemoryStack::requestMemory(size_t size)
-{
-    const size_t freeMem = dataend - tip;
-
-    if (size > freeMem)
-        return 0;
-
-    uchar* ptr = tip;
-
-    tip += size;
-
-#if defined(DEBUG) || defined(_DEBUG)
-    allocations.push_back(size);
-#endif
-
-    return ptr;
-}
-
-void cv::cuda::MemoryStack::returnMemory(uchar* ptr)
-{
-    CV_DbgAssert( ptr >= datastart && ptr < dataend );
-
-#if defined(DEBUG) || defined(_DEBUG)
-    const size_t allocSize = tip - ptr;
-    CV_Assert( allocSize == allocations.back() );
-    allocations.pop_back();
-#endif
-
-    tip = ptr;
-}
-
-/////////////////////////////////////////////////////////////
-/// MemoryPool
-
-namespace
-{
-    class MemoryPool
-    {
-    public:
-        MemoryPool();
-
-        void initialize(size_t stackSize, int stackCount);
-        void release();
-
-        MemoryStack* getFreeMemStack();
-        void returnMemStack(MemoryStack* memStack);
-
-    private:
-        void initilizeImpl();
-
-        Mutex mtx_;
-
-        bool initialized_;
-        size_t stackSize_;
-        int stackCount_;
-
-        uchar* mem_;
-
-        std::vector<MemoryStack> stacks_;
-    };
-
-    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
-    {
-        // default : 10 Mb, 5 stacks
-        stackSize_ = 10 * 1024 * 1024;
-        stackCount_ = 5;
-    }
-
-    void MemoryPool::initialize(size_t stackSize, int stackCount)
-    {
-        AutoLock lock(mtx_);
-
-        release();
-
-        stackSize_ = stackSize;
-        stackCount_ = stackCount;
-
-        initilizeImpl();
-    }
-
-    void MemoryPool::initilizeImpl()
-    {
-        const size_t totalSize = stackSize_ * stackCount_;
-
-        if (totalSize > 0)
-        {
-            cudaError_t err = cudaMalloc(&mem_, totalSize);
-            if (err != cudaSuccess)
-                return;
-
-            stacks_.resize(stackCount_);
-
-            uchar* ptr = mem_;
-
-            for (int i = 0; i < stackCount_; ++i)
-            {
-                stacks_[i].datastart = ptr;
-                stacks_[i].dataend = ptr + stackSize_;
-                stacks_[i].tip = ptr;
-                stacks_[i].isFree = true;
-                stacks_[i].pool = this;
-
-                ptr += stackSize_;
-            }
-
-            initialized_ = true;
-        }
-    }
-
-    void MemoryPool::release()
-    {
-        if (mem_)
-        {
-#if defined(DEBUG) || defined(_DEBUG)
-            for (int i = 0; i < stackCount_; ++i)
-            {
-                CV_DbgAssert( stacks_[i].isFree );
-                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
-            }
-#endif
-
-            cudaFree( mem_ );
-
-            mem_ = 0;
-            initialized_ = false;
-        }
-    }
-
-    MemoryStack* MemoryPool::getFreeMemStack()
-    {
-        AutoLock lock(mtx_);
-        if (!initialized_)
-            initilizeImpl();
-
-        if (!mem_)
-            return 0;
-
-        for (int i = 0; i < stackCount_; ++i)
-        {
-            if (stacks_[i].isFree)
-            {
-                stacks_[i].isFree = false;
-                return &stacks_[i];
-            }
-        }
-
-        return 0;
-    }
-
-    void MemoryPool::returnMemStack(MemoryStack* memStack)
-    {
-        AutoLock lock(mtx_);
-
-        CV_DbgAssert( !memStack->isFree );
-
-#if defined(DEBUG) || defined(_DEBUG)
-        bool found = false;
-        for (int i = 0; i < stackCount_; ++i)
-        {
-            if (memStack == &stacks_[i])
-            {
-                found = true;
-                break;
-            }
-        }
-        CV_DbgAssert( found );
-#endif
-
-        CV_DbgAssert( memStack->tip == memStack->datastart );
-
-        memStack->isFree = true;
-    }
-}
-
-/////////////////////////////////////////////////////////////
-/// MemoryPoolManager
-
-namespace
-{
-    Mutex mtx_;
-    bool memory_pool_manager_initialized;
-
-    class MemoryPoolManager
-    {
-    public:
-        MemoryPoolManager();
-        ~MemoryPoolManager();
-        void Init();
-
-        MemoryPool* getPool(int deviceId);
-
-    private:
-        std::vector<MemoryPool> pools_;
-    } manager;
-
-    //MemoryPoolManager ;
-
-    MemoryPoolManager::MemoryPoolManager()
-    {
-    }
-
-    void MemoryPoolManager::Init()
-    {
-        int deviceCount = getCudaEnabledDeviceCount();
-        if (deviceCount > 0)
-            pools_.resize(deviceCount);
-    }
-
-    MemoryPoolManager::~MemoryPoolManager()
-    {
-        for (size_t i = 0; i < pools_.size(); ++i)
-        {
-            cudaSetDevice(static_cast<int>(i));
-            pools_[i].release();
-        }
-    }
-
-    MemoryPool* MemoryPoolManager::getPool(int deviceId)
-    {
-        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
-        return &pools_[deviceId];
-    }
-
-    MemoryPool* memPool(int deviceId)
-    {
-        {
-            AutoLock lock(mtx_);
-            if (!memory_pool_manager_initialized)
-            {
-                memory_pool_manager_initialized = true;
-                manager.Init();
-            }
-        }
-        return manager.getPool(deviceId);
-    }
-}
-
-/////////////////////////////////////////////////////////////
-/// StackAllocator
-
-namespace
-{
-    bool enableMemoryPool = true;
-}
-
-cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
-{
-    if (enableMemoryPool)
-    {
-        const int deviceId = getDevice();
-        {
-            AutoLock lock(mtx_);
-            memStack_ = memPool(deviceId)->getFreeMemStack();
-        }
-        DeviceInfo devInfo(deviceId);
-        alignment_ = devInfo.textureAlignment();
-    }
-}
-
-cv::cuda::StackAllocator::~StackAllocator()
-{
-    cudaStreamSynchronize(stream_);
-
-    if (memStack_ != 0)
-        memStack_->pool->returnMemStack(memStack_);
-}
-
-namespace
-{
-    size_t alignUp(size_t what, size_t alignment)
-    {
-        size_t alignMask = alignment-1;
-        size_t inverseAlignMask = ~alignMask;
-        size_t res = (what + alignMask) & inverseAlignMask;
-        return res;
-    }
-}
-
-bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
-{
-    if (memStack_ == 0)
-        return false;
-
-    size_t pitch, memSize;
-
-    if (rows > 1 && cols > 1)
-    {
-        pitch = alignUp(cols * elemSize, alignment_);
-        memSize = pitch * rows;
-    }
-    else
-    {
-        // Single row or single column must be continuous
-        pitch = elemSize * cols;
-        memSize = alignUp(elemSize * cols * rows, 64);
-    }
-
-    uchar* ptr = memStack_->requestMemory(memSize);
-
-    if (ptr == 0)
-        return false;
-
-    mat->data = ptr;
-    mat->step = pitch;
-    mat->refcount = (int*) fastMalloc(sizeof(int));
-
-    return true;
-}
-
-void cv::cuda::StackAllocator::free(GpuMat* mat)
-{
-    if (memStack_ == 0)
-        return;
-
-    memStack_->returnMemory(mat->datastart);
-    fastFree(mat->refcount);
-}
-
-void cv::cuda::setBufferPoolUsage(bool on)
-{
-    enableMemoryPool = on;
-}
-
-void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
-{
-    const int currentDevice = getDevice();
-
-    if (deviceId >= 0)
-    {
-        setDevice(deviceId);
-        memPool(deviceId)->initialize(stackSize, stackCount);
-    }
-    else
-    {
-        const int deviceCount = getCudaEnabledDeviceCount();
-
-        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
-        {
-            setDevice(deviceId);
-            memPool(deviceId)->initialize(stackSize, stackCount);
-        }
-    }
-
-    setDevice(currentDevice);
-}
-
-/////////////////////////////////////////////////////////////
-/// BufferPool
-
-GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
-{
-    GpuMat buf(allocator_);
-    buf.create(rows, cols, type);
-    return buf;
-}
-
-#endif
diff --git a/modules/core/src/cuda_stream.cpp b/modules/core/src/cuda_stream.cpp
index 98a29df19..efcf9cb3e 100644
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@@ -45,8 +45,217 @@
 using namespace cv;
 using namespace cv::cuda;
 
+/////////////////////////////////////////////////////////////
+/// MemoryStack
+
+#ifdef HAVE_CUDA
+
+namespace
+{
+    class MemoryPool;
+}
+
+class cv::cuda::MemoryStack
+{
+public:
+    uchar* requestMemory(size_t size);
+    void returnMemory(uchar* ptr);
+
+    uchar* datastart;
+    uchar* dataend;
+    uchar* tip;
+
+    bool isFree;
+    MemoryPool* pool;
+
+#if !defined(NDEBUG)
+    std::vector<size_t> allocations;
+#endif
+};
+
+uchar* cv::cuda::MemoryStack::requestMemory(size_t size)
+{
+    const size_t freeMem = dataend - tip;
+
+    if (size > freeMem)
+        return 0;
+
+    uchar* ptr = tip;
+
+    tip += size;
+
+#if !defined(NDEBUG)
+    allocations.push_back(size);
+#endif
+
+    return ptr;
+}
+
+void cv::cuda::MemoryStack::returnMemory(uchar* ptr)
+{
+    CV_DbgAssert( ptr >= datastart && ptr < dataend );
+
+#if !defined(NDEBUG)
+    const size_t allocSize = tip - ptr;
+    CV_Assert( allocSize == allocations.back() );
+    allocations.pop_back();
+#endif
+
+    tip = ptr;
+}
+
+#endif
+
+/////////////////////////////////////////////////////////////
+/// MemoryPool
+
+#ifdef HAVE_CUDA
+
+namespace
+{
+    class MemoryPool
+    {
+    public:
+        MemoryPool();
+
+        void initialize(size_t stackSize, int stackCount);
+        void release();
+
+        MemoryStack* getFreeMemStack();
+        void returnMemStack(MemoryStack* memStack);
+
+    private:
+        void initilizeImpl();
+
+        Mutex mtx_;
+
+        bool initialized_;
+        size_t stackSize_;
+        int stackCount_;
+
+        uchar* mem_;
+
+        std::vector<MemoryStack> stacks_;
+    };
+
+    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
+    {
+        // default : 10 Mb, 5 stacks
+        stackSize_ = 10 * 1024 * 1024;
+        stackCount_ = 5;
+    }
+
+    void MemoryPool::initialize(size_t stackSize, int stackCount)
+    {
+        AutoLock lock(mtx_);
+
+        release();
+
+        stackSize_ = stackSize;
+        stackCount_ = stackCount;
+
+        initilizeImpl();
+    }
+
+    void MemoryPool::initilizeImpl()
+    {
+        const size_t totalSize = stackSize_ * stackCount_;
+
+        if (totalSize > 0)
+        {
+            cudaError_t err = cudaMalloc(&mem_, totalSize);
+            if (err != cudaSuccess)
+                return;
+
+            stacks_.resize(stackCount_);
+
+            uchar* ptr = mem_;
+
+            for (int i = 0; i < stackCount_; ++i)
+            {
+                stacks_[i].datastart = ptr;
+                stacks_[i].dataend = ptr + stackSize_;
+                stacks_[i].tip = ptr;
+                stacks_[i].isFree = true;
+                stacks_[i].pool = this;
+
+                ptr += stackSize_;
+            }
+
+            initialized_ = true;
+        }
+    }
+
+    void MemoryPool::release()
+    {
+        if (mem_)
+        {
+#if !defined(NDEBUG)
+            for (int i = 0; i < stackCount_; ++i)
+            {
+                CV_DbgAssert( stacks_[i].isFree );
+                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
+            }
+#endif
+
+            cudaFree(mem_);
+
+            mem_ = 0;
+            initialized_ = false;
+        }
+    }
+
+    MemoryStack* MemoryPool::getFreeMemStack()
+    {
+        AutoLock lock(mtx_);
+
+        if (!initialized_)
+            initilizeImpl();
+
+        if (!mem_)
+            return 0;
+
+        for (int i = 0; i < stackCount_; ++i)
+        {
+            if (stacks_[i].isFree)
+            {
+                stacks_[i].isFree = false;
+                return &stacks_[i];
+            }
+        }
+
+        return 0;
+    }
+
+    void MemoryPool::returnMemStack(MemoryStack* memStack)
+    {
+        AutoLock lock(mtx_);
+
+        CV_DbgAssert( !memStack->isFree );
+
+#if !defined(NDEBUG)
+        bool found = false;
+        for (int i = 0; i < stackCount_; ++i)
+        {
+            if (memStack == &stacks_[i])
+            {
+                found = true;
+                break;
+            }
+        }
+        CV_DbgAssert( found );
+#endif
+
+        CV_DbgAssert( memStack->tip == memStack->datastart );
+
+        memStack->isFree = true;
+    }
+}
+
+#endif
+
 ////////////////////////////////////////////////////////////////
-// Stream
+/// Stream::Impl
 
 #ifndef HAVE_CUDA
 
@@ -74,10 +283,6 @@ public:
     ~Impl();
 };
 
-cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
-{
-}
-
 cv::cuda::Stream::Impl::Impl() : stream(0)
 {
     cudaSafeCall( cudaStreamCreate(&stream) );
@@ -98,13 +303,120 @@ cv::cuda::Stream::Impl::~Impl()
         cudaStreamDestroy(stream);
 }
 
-cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
+#endif
+
+/////////////////////////////////////////////////////////////
+/// DefaultDeviceInitializer
+
+#ifdef HAVE_CUDA
+
+namespace cv { namespace cuda
 {
-    return stream.impl_->stream;
-}
+    class DefaultDeviceInitializer
+    {
+    public:
+        DefaultDeviceInitializer();
+        ~DefaultDeviceInitializer();
+
+        Stream& getNullStream(int deviceId);
+        MemoryPool* getMemoryPool(int deviceId);
+
+    private:
+        void initStreams();
+        void initPools();
+
+        Mutex streams_mtx_;
+        volatile bool streams_initialized_;
+
+        Mutex pools_mtx_;
+        volatile bool pools_initialized_;
+
+        std::vector<Ptr<Stream> > streams_;
+        std::vector<MemoryPool> pools_;
+    };
+
+    DefaultDeviceInitializer::DefaultDeviceInitializer()
+    {
+    }
+
+    DefaultDeviceInitializer::~DefaultDeviceInitializer()
+    {
+        streams_.clear();
+
+        for (size_t i = 0; i < pools_.size(); ++i)
+        {
+            cudaSetDevice(static_cast<int>(i));
+            pools_[i].release();
+        }
+
+        pools_.clear();
+    }
+
+    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
+    {
+        initStreams();
+
+        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );
+
+        return *streams_[deviceId];
+    }
+
+    MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
+    {
+        initPools();
+
+        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
+
+        return &pools_[deviceId];
+    }
+
+    void DefaultDeviceInitializer::initStreams()
+    {
+        AutoLock lock(streams_mtx_);
+
+        if (!streams_initialized_)
+        {
+            int deviceCount = getCudaEnabledDeviceCount();
+
+            if (deviceCount > 0)
+            {
+                streams_.resize(deviceCount);
+
+                for (int i = 0; i < deviceCount; ++i)
+                {
+                    cudaStream_t stream = NULL;
+                    Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
+                    streams_[i] = Ptr<Stream>(new Stream(impl));
+                }
+            }
+
+            streams_initialized_ = true;
+        }
+    }
+
+    void DefaultDeviceInitializer::initPools()
+    {
+        AutoLock lock(pools_mtx_);
+
+        if (!pools_initialized_)
+        {
+            int deviceCount = getCudaEnabledDeviceCount();
+
+            if (deviceCount > 0)
+                pools_.resize(deviceCount);
+
+            pools_initialized_ = true;
+        }
+    }
+
+    DefaultDeviceInitializer initializer;
+}}
 
 #endif
 
+/////////////////////////////////////////////////////////////
+/// Stream
+
 cv::cuda::Stream::Stream()
 {
 #ifndef HAVE_CUDA
@@ -181,7 +493,7 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
     #if CUDART_VERSION < 5000
         (void) callback;
         (void) userData;
-        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0");
+        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
     #else
         CallbackData* data = new CallbackData(callback, userData);
 
@@ -190,22 +502,16 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa
 #endif
 }
 
-namespace
-{
-    bool default_stream_is_initialized;
-    Mutex mtx;
-    Ptr<Stream> default_stream;
-}
-
 Stream& cv::cuda::Stream::Null()
 {
-    AutoLock lock(mtx);
-    if (!default_stream_is_initialized)
-    {
-        default_stream = Ptr<Stream>(new Stream(Ptr<Impl>(new Impl(0))));
-        default_stream_is_initialized = true;
-    }
-    return *default_stream;
+#ifndef HAVE_CUDA
+    throw_no_cuda();
+    static Stream stream;
+    return stream;
+#else
+    const int deviceId = getDevice();
+    return initializer.getNullStream(deviceId);
+#endif
 }
 
 cv::cuda::Stream::operator bool_type() const
@@ -217,6 +523,142 @@ cv::cuda::Stream::operator bool_type() const
 #endif
 }
 
+#ifdef HAVE_CUDA
+
+cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
+{
+    return stream.impl_->stream;
+}
+
+#endif
+
+/////////////////////////////////////////////////////////////
+/// StackAllocator
+
+#ifdef HAVE_CUDA
+
+namespace
+{
+    bool enableMemoryPool = true;
+}
+
+cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
+{
+    if (enableMemoryPool)
+    {
+        const int deviceId = getDevice();
+        memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
+        DeviceInfo devInfo(deviceId);
+        alignment_ = devInfo.textureAlignment();
+    }
+}
+
+cv::cuda::StackAllocator::~StackAllocator()
+{
+    cudaStreamSynchronize(stream_);
+
+    if (memStack_ != 0)
+        memStack_->pool->returnMemStack(memStack_);
+}
+
+namespace
+{
+    size_t alignUp(size_t what, size_t alignment)
+    {
+        size_t alignMask = alignment-1;
+        size_t inverseAlignMask = ~alignMask;
+        size_t res = (what + alignMask) & inverseAlignMask;
+        return res;
+    }
+}
+
+bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
+{
+    if (memStack_ == 0)
+        return false;
+
+    size_t pitch, memSize;
+
+    if (rows > 1 && cols > 1)
+    {
+        pitch = alignUp(cols * elemSize, alignment_);
+        memSize = pitch * rows;
+    }
+    else
+    {
+        // Single row or single column must be continuous
+        pitch = elemSize * cols;
+        memSize = alignUp(elemSize * cols * rows, 64);
+    }
+
+    uchar* ptr = memStack_->requestMemory(memSize);
+
+    if (ptr == 0)
+        return false;
+
+    mat->data = ptr;
+    mat->step = pitch;
+    mat->refcount = (int*) fastMalloc(sizeof(int));
+
+    return true;
+}
+
+void cv::cuda::StackAllocator::free(GpuMat* mat)
+{
+    if (memStack_ == 0)
+        return;
+
+    memStack_->returnMemory(mat->datastart);
+    fastFree(mat->refcount);
+}
+
+void cv::cuda::setBufferPoolUsage(bool on)
+{
+    enableMemoryPool = on;
+}
+
+void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
+{
+    const int currentDevice = getDevice();
+
+    if (deviceId >= 0)
+    {
+        setDevice(deviceId);
+        initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
+    }
+    else
+    {
+        const int deviceCount = getCudaEnabledDeviceCount();
+
+        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
+        {
+            setDevice(deviceId);
+            initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
+        }
+    }
+
+    setDevice(currentDevice);
+}
+
+#endif
+
+/////////////////////////////////////////////////////////////
+/// BufferPool
+
+#ifdef HAVE_CUDA
+
+cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
+{
+}
+
+GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
+{
+    GpuMat buf(allocator_);
+    buf.create(rows, cols, type);
+    return buf;
+}
+
+#endif
 
 ////////////////////////////////////////////////////////////////
 // Event