From 7ed38b97c3726c14155865267b054d2ad6049f41 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 17 Dec 2014 18:51:15 +0300 Subject: [PATCH] fix cuda::BufferPool deinitialization The deinitialization of BufferPool internal objects is controled by global object, but it depends on other global objects, which leads to errors caused by undefined deinitialization order of global objects. I merge global objects initialization into single class, which performs initialization and deinitialization in correct order. --- modules/core/include/opencv2/core/cuda.hpp | 1 + modules/core/src/cuda_buffer_pool.cpp | 435 ------------------ modules/core/src/cuda_stream.cpp | 488 ++++++++++++++++++++- 3 files changed, 466 insertions(+), 458 deletions(-) delete mode 100644 modules/core/src/cuda_buffer_pool.cpp diff --git a/modules/core/include/opencv2/core/cuda.hpp b/modules/core/include/opencv2/core/cuda.hpp index 15d526e80..8e0944061 100644 --- a/modules/core/include/opencv2/core/cuda.hpp +++ b/modules/core/include/opencv2/core/cuda.hpp @@ -479,6 +479,7 @@ private: friend struct StreamAccessor; friend class BufferPool; + friend class DefaultDeviceInitializer; }; class CV_EXPORTS Event diff --git a/modules/core/src/cuda_buffer_pool.cpp b/modules/core/src/cuda_buffer_pool.cpp deleted file mode 100644 index e5caf6ef2..000000000 --- a/modules/core/src/cuda_buffer_pool.cpp +++ /dev/null @@ -1,435 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" - -using namespace cv; -using namespace cv::cuda; - -#ifdef HAVE_CUDA - -#include "opencv2/cudev/common.hpp" - -///////////////////////////////////////////////////////////// -/// MemoryStack - -namespace -{ - class MemoryPool; -} - -class cv::cuda::MemoryStack -{ -public: - uchar* requestMemory(size_t size); - void returnMemory(uchar* ptr); - - uchar* datastart; - uchar* dataend; - uchar* tip; - - bool isFree; - MemoryPool* pool; - -#if defined(DEBUG) || defined(_DEBUG) - std::vector allocations; -#endif -}; - -uchar* cv::cuda::MemoryStack::requestMemory(size_t size) -{ - const size_t freeMem = dataend - tip; - - if (size > freeMem) - return 0; - - uchar* ptr = tip; - - tip += size; - -#if defined(DEBUG) || defined(_DEBUG) - allocations.push_back(size); -#endif - - return ptr; -} - -void cv::cuda::MemoryStack::returnMemory(uchar* ptr) -{ - CV_DbgAssert( ptr >= datastart && ptr < dataend ); - -#if defined(DEBUG) || defined(_DEBUG) - const size_t allocSize = tip - ptr; - CV_Assert( allocSize == allocations.back() ); - allocations.pop_back(); -#endif - - tip = ptr; -} - -///////////////////////////////////////////////////////////// -/// MemoryPool - -namespace -{ - class MemoryPool - { - public: - MemoryPool(); - - void initialize(size_t stackSize, int stackCount); - void release(); - - MemoryStack* getFreeMemStack(); - void returnMemStack(MemoryStack* memStack); - - private: - void initilizeImpl(); - - Mutex mtx_; - - bool initialized_; - size_t stackSize_; - int stackCount_; - - uchar* mem_; - - std::vector stacks_; - }; - - MemoryPool::MemoryPool() : initialized_(false), mem_(0) - { - // default : 10 Mb, 5 stacks - stackSize_ = 10 * 1024 * 1024; - stackCount_ = 5; - } - - void MemoryPool::initialize(size_t stackSize, int stackCount) - { - AutoLock lock(mtx_); - - release(); - - stackSize_ = stackSize; - stackCount_ = stackCount; - - initilizeImpl(); - } - - void MemoryPool::initilizeImpl() - { - const size_t totalSize = stackSize_ * stackCount_; - - if (totalSize > 0) - { - cudaError_t err = cudaMalloc(&mem_, totalSize); - if (err != cudaSuccess) - return; - - stacks_.resize(stackCount_); - - uchar* ptr = mem_; - - for (int i = 0; i < stackCount_; ++i) - { - stacks_[i].datastart = ptr; - stacks_[i].dataend = ptr + stackSize_; - stacks_[i].tip = ptr; - stacks_[i].isFree = true; - stacks_[i].pool = this; - - ptr += stackSize_; - } - - initialized_ = true; - } - } - - void MemoryPool::release() - { - if (mem_) - { -#if defined(DEBUG) || defined(_DEBUG) - for (int i = 0; i < stackCount_; ++i) - { - CV_DbgAssert( stacks_[i].isFree ); - CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart ); - } -#endif - - cudaFree( mem_ ); - - mem_ = 0; - initialized_ = false; - } - } - - MemoryStack* MemoryPool::getFreeMemStack() - { - AutoLock lock(mtx_); - if (!initialized_) - initilizeImpl(); - - if (!mem_) - return 0; - - for (int i = 0; i < stackCount_; ++i) - { - if (stacks_[i].isFree) - { - stacks_[i].isFree = false; - return &stacks_[i]; - } - } - - return 0; - } - - void MemoryPool::returnMemStack(MemoryStack* memStack) - { - AutoLock lock(mtx_); - - CV_DbgAssert( !memStack->isFree ); - -#if defined(DEBUG) || defined(_DEBUG) - bool found = false; - for (int i = 0; i < stackCount_; ++i) - { - if (memStack == &stacks_[i]) - { - found = true; - break; - } - } - CV_DbgAssert( found ); -#endif - - CV_DbgAssert( memStack->tip == memStack->datastart ); - - memStack->isFree = true; - } -} - -///////////////////////////////////////////////////////////// -/// MemoryPoolManager - -namespace -{ - Mutex mtx_; - bool memory_pool_manager_initialized; - - class MemoryPoolManager - { - public: - MemoryPoolManager(); - ~MemoryPoolManager(); - void Init(); - - MemoryPool* getPool(int deviceId); - - private: - std::vector pools_; - } manager; - - //MemoryPoolManager ; - - MemoryPoolManager::MemoryPoolManager() - { - } - - void MemoryPoolManager::Init() - { - int deviceCount = getCudaEnabledDeviceCount(); - if (deviceCount > 0) - pools_.resize(deviceCount); - } - - MemoryPoolManager::~MemoryPoolManager() - { - for (size_t i = 0; i < pools_.size(); ++i) - { - cudaSetDevice(static_cast(i)); - pools_[i].release(); - } - } - - MemoryPool* MemoryPoolManager::getPool(int deviceId) - { - CV_DbgAssert( deviceId >= 0 && deviceId < static_cast(pools_.size()) ); - return &pools_[deviceId]; - } - - MemoryPool* memPool(int deviceId) - { - { - AutoLock lock(mtx_); - if (!memory_pool_manager_initialized) - { - memory_pool_manager_initialized = true; - manager.Init(); - } - } - return manager.getPool(deviceId); - } -} - -///////////////////////////////////////////////////////////// -/// StackAllocator - -namespace -{ - bool enableMemoryPool = true; -} - -cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0) -{ - if (enableMemoryPool) - { - const int deviceId = getDevice(); - { - AutoLock lock(mtx_); - memStack_ = memPool(deviceId)->getFreeMemStack(); - } - DeviceInfo devInfo(deviceId); - alignment_ = devInfo.textureAlignment(); - } -} - -cv::cuda::StackAllocator::~StackAllocator() -{ - cudaStreamSynchronize(stream_); - - if (memStack_ != 0) - memStack_->pool->returnMemStack(memStack_); -} - -namespace -{ - size_t alignUp(size_t what, size_t alignment) - { - size_t alignMask = alignment-1; - size_t inverseAlignMask = ~alignMask; - size_t res = (what + alignMask) & inverseAlignMask; - return res; - } -} - -bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize) -{ - if (memStack_ == 0) - return false; - - size_t pitch, memSize; - - if (rows > 1 && cols > 1) - { - pitch = alignUp(cols * elemSize, alignment_); - memSize = pitch * rows; - } - else - { - // Single row or single column must be continuous - pitch = elemSize * cols; - memSize = alignUp(elemSize * cols * rows, 64); - } - - uchar* ptr = memStack_->requestMemory(memSize); - - if (ptr == 0) - return false; - - mat->data = ptr; - mat->step = pitch; - mat->refcount = (int*) fastMalloc(sizeof(int)); - - return true; -} - -void cv::cuda::StackAllocator::free(GpuMat* mat) -{ - if (memStack_ == 0) - return; - - memStack_->returnMemory(mat->datastart); - fastFree(mat->refcount); -} - -void cv::cuda::setBufferPoolUsage(bool on) -{ - enableMemoryPool = on; -} - -void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount) -{ - const int currentDevice = getDevice(); - - if (deviceId >= 0) - { - setDevice(deviceId); - memPool(deviceId)->initialize(stackSize, stackCount); - } - else - { - const int deviceCount = getCudaEnabledDeviceCount(); - - for (deviceId = 0; deviceId < deviceCount; ++deviceId) - { - setDevice(deviceId); - memPool(deviceId)->initialize(stackSize, stackCount); - } - } - - setDevice(currentDevice); -} - -///////////////////////////////////////////////////////////// -/// BufferPool - -GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type) -{ - GpuMat buf(allocator_); - buf.create(rows, cols, type); - return buf; -} - -#endif diff --git a/modules/core/src/cuda_stream.cpp b/modules/core/src/cuda_stream.cpp index 98a29df19..efcf9cb3e 100644 --- a/modules/core/src/cuda_stream.cpp +++ b/modules/core/src/cuda_stream.cpp @@ -45,8 +45,217 @@ using namespace cv; using namespace cv::cuda; +///////////////////////////////////////////////////////////// +/// MemoryStack + +#ifdef HAVE_CUDA + +namespace +{ + class MemoryPool; +} + +class cv::cuda::MemoryStack +{ +public: + uchar* requestMemory(size_t size); + void returnMemory(uchar* ptr); + + uchar* datastart; + uchar* dataend; + uchar* tip; + + bool isFree; + MemoryPool* pool; + +#if !defined(NDEBUG) + std::vector allocations; +#endif +}; + +uchar* cv::cuda::MemoryStack::requestMemory(size_t size) +{ + const size_t freeMem = dataend - tip; + + if (size > freeMem) + return 0; + + uchar* ptr = tip; + + tip += size; + +#if !defined(NDEBUG) + allocations.push_back(size); +#endif + + return ptr; +} + +void cv::cuda::MemoryStack::returnMemory(uchar* ptr) +{ + CV_DbgAssert( ptr >= datastart && ptr < dataend ); + +#if !defined(NDEBUG) + const size_t allocSize = tip - ptr; + CV_Assert( allocSize == allocations.back() ); + allocations.pop_back(); +#endif + + tip = ptr; +} + +#endif + +///////////////////////////////////////////////////////////// +/// MemoryPool + +#ifdef HAVE_CUDA + +namespace +{ + class MemoryPool + { + public: + MemoryPool(); + + void initialize(size_t stackSize, int stackCount); + void release(); + + MemoryStack* getFreeMemStack(); + void returnMemStack(MemoryStack* memStack); + + private: + void initilizeImpl(); + + Mutex mtx_; + + bool initialized_; + size_t stackSize_; + int stackCount_; + + uchar* mem_; + + std::vector stacks_; + }; + + MemoryPool::MemoryPool() : initialized_(false), mem_(0) + { + // default : 10 Mb, 5 stacks + stackSize_ = 10 * 1024 * 1024; + stackCount_ = 5; + } + + void MemoryPool::initialize(size_t stackSize, int stackCount) + { + AutoLock lock(mtx_); + + release(); + + stackSize_ = stackSize; + stackCount_ = stackCount; + + initilizeImpl(); + } + + void MemoryPool::initilizeImpl() + { + const size_t totalSize = stackSize_ * stackCount_; + + if (totalSize > 0) + { + cudaError_t err = cudaMalloc(&mem_, totalSize); + if (err != cudaSuccess) + return; + + stacks_.resize(stackCount_); + + uchar* ptr = mem_; + + for (int i = 0; i < stackCount_; ++i) + { + stacks_[i].datastart = ptr; + stacks_[i].dataend = ptr + stackSize_; + stacks_[i].tip = ptr; + stacks_[i].isFree = true; + stacks_[i].pool = this; + + ptr += stackSize_; + } + + initialized_ = true; + } + } + + void MemoryPool::release() + { + if (mem_) + { +#if !defined(NDEBUG) + for (int i = 0; i < stackCount_; ++i) + { + CV_DbgAssert( stacks_[i].isFree ); + CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart ); + } +#endif + + cudaFree(mem_); + + mem_ = 0; + initialized_ = false; + } + } + + MemoryStack* MemoryPool::getFreeMemStack() + { + AutoLock lock(mtx_); + + if (!initialized_) + initilizeImpl(); + + if (!mem_) + return 0; + + for (int i = 0; i < stackCount_; ++i) + { + if (stacks_[i].isFree) + { + stacks_[i].isFree = false; + return &stacks_[i]; + } + } + + return 0; + } + + void MemoryPool::returnMemStack(MemoryStack* memStack) + { + AutoLock lock(mtx_); + + CV_DbgAssert( !memStack->isFree ); + +#if !defined(NDEBUG) + bool found = false; + for (int i = 0; i < stackCount_; ++i) + { + if (memStack == &stacks_[i]) + { + found = true; + break; + } + } + CV_DbgAssert( found ); +#endif + + CV_DbgAssert( memStack->tip == memStack->datastart ); + + memStack->isFree = true; + } +} + +#endif + //////////////////////////////////////////////////////////////// -// Stream +/// Stream::Impl #ifndef HAVE_CUDA @@ -74,10 +283,6 @@ public: ~Impl(); }; -cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get()) -{ -} - cv::cuda::Stream::Impl::Impl() : stream(0) { cudaSafeCall( cudaStreamCreate(&stream) ); @@ -98,13 +303,120 @@ cv::cuda::Stream::Impl::~Impl() cudaStreamDestroy(stream); } -cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream) +#endif + +///////////////////////////////////////////////////////////// +/// DefaultDeviceInitializer + +#ifdef HAVE_CUDA + +namespace cv { namespace cuda { - return stream.impl_->stream; -} + class DefaultDeviceInitializer + { + public: + DefaultDeviceInitializer(); + ~DefaultDeviceInitializer(); + + Stream& getNullStream(int deviceId); + MemoryPool* getMemoryPool(int deviceId); + + private: + void initStreams(); + void initPools(); + + Mutex streams_mtx_; + volatile bool streams_initialized_; + + Mutex pools_mtx_; + volatile bool pools_initialized_; + + std::vector > streams_; + std::vector pools_; + }; + + DefaultDeviceInitializer::DefaultDeviceInitializer() + { + } + + DefaultDeviceInitializer::~DefaultDeviceInitializer() + { + streams_.clear(); + + for (size_t i = 0; i < pools_.size(); ++i) + { + cudaSetDevice(static_cast(i)); + pools_[i].release(); + } + + pools_.clear(); + } + + Stream& DefaultDeviceInitializer::getNullStream(int deviceId) + { + initStreams(); + + CV_DbgAssert( deviceId >= 0 && deviceId < static_cast(streams_.size()) ); + + return *streams_[deviceId]; + } + + MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId) + { + initPools(); + + CV_DbgAssert( deviceId >= 0 && deviceId < static_cast(pools_.size()) ); + + return &pools_[deviceId]; + } + + void DefaultDeviceInitializer::initStreams() + { + AutoLock lock(streams_mtx_); + + if (!streams_initialized_) + { + int deviceCount = getCudaEnabledDeviceCount(); + + if (deviceCount > 0) + { + streams_.resize(deviceCount); + + for (int i = 0; i < deviceCount; ++i) + { + cudaStream_t stream = NULL; + Ptr impl = makePtr(stream); + streams_[i] = Ptr(new Stream(impl)); + } + } + + streams_initialized_ = true; + } + } + + void DefaultDeviceInitializer::initPools() + { + AutoLock lock(pools_mtx_); + + if (!pools_initialized_) + { + int deviceCount = getCudaEnabledDeviceCount(); + + if (deviceCount > 0) + pools_.resize(deviceCount); + + pools_initialized_ = true; + } + } + + DefaultDeviceInitializer initializer; +}} #endif +///////////////////////////////////////////////////////////// +/// Stream + cv::cuda::Stream::Stream() { #ifndef HAVE_CUDA @@ -181,7 +493,7 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa #if CUDART_VERSION < 5000 (void) callback; (void) userData; - CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA 5.0"); + CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0"); #else CallbackData* data = new CallbackData(callback, userData); @@ -190,22 +502,16 @@ void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userDa #endif } -namespace -{ - bool default_stream_is_initialized; - Mutex mtx; - Ptr default_stream; -} - Stream& cv::cuda::Stream::Null() { - AutoLock lock(mtx); - if (!default_stream_is_initialized) - { - default_stream = Ptr(new Stream(Ptr(new Impl(0)))); - default_stream_is_initialized = true; - } - return *default_stream; +#ifndef HAVE_CUDA + throw_no_cuda(); + static Stream stream; + return stream; +#else + const int deviceId = getDevice(); + return initializer.getNullStream(deviceId); +#endif } cv::cuda::Stream::operator bool_type() const @@ -217,6 +523,142 @@ cv::cuda::Stream::operator bool_type() const #endif } +#ifdef HAVE_CUDA + +cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream) +{ + return stream.impl_->stream; +} + +#endif + +///////////////////////////////////////////////////////////// +/// StackAllocator + +#ifdef HAVE_CUDA + +namespace +{ + bool enableMemoryPool = true; +} + +cv::cuda::StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0) +{ + if (enableMemoryPool) + { + const int deviceId = getDevice(); + memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack(); + DeviceInfo devInfo(deviceId); + alignment_ = devInfo.textureAlignment(); + } +} + +cv::cuda::StackAllocator::~StackAllocator() +{ + cudaStreamSynchronize(stream_); + + if (memStack_ != 0) + memStack_->pool->returnMemStack(memStack_); +} + +namespace +{ + size_t alignUp(size_t what, size_t alignment) + { + size_t alignMask = alignment-1; + size_t inverseAlignMask = ~alignMask; + size_t res = (what + alignMask) & inverseAlignMask; + return res; + } +} + +bool cv::cuda::StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize) +{ + if (memStack_ == 0) + return false; + + size_t pitch, memSize; + + if (rows > 1 && cols > 1) + { + pitch = alignUp(cols * elemSize, alignment_); + memSize = pitch * rows; + } + else + { + // Single row or single column must be continuous + pitch = elemSize * cols; + memSize = alignUp(elemSize * cols * rows, 64); + } + + uchar* ptr = memStack_->requestMemory(memSize); + + if (ptr == 0) + return false; + + mat->data = ptr; + mat->step = pitch; + mat->refcount = (int*) fastMalloc(sizeof(int)); + + return true; +} + +void cv::cuda::StackAllocator::free(GpuMat* mat) +{ + if (memStack_ == 0) + return; + + memStack_->returnMemory(mat->datastart); + fastFree(mat->refcount); +} + +void cv::cuda::setBufferPoolUsage(bool on) +{ + enableMemoryPool = on; +} + +void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount) +{ + const int currentDevice = getDevice(); + + if (deviceId >= 0) + { + setDevice(deviceId); + initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount); + } + else + { + const int deviceCount = getCudaEnabledDeviceCount(); + + for (deviceId = 0; deviceId < deviceCount; ++deviceId) + { + setDevice(deviceId); + initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount); + } + } + + setDevice(currentDevice); +} + +#endif + +///////////////////////////////////////////////////////////// +/// BufferPool + +#ifdef HAVE_CUDA + +cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get()) +{ +} + +GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type) +{ + GpuMat buf(allocator_); + buf.create(rows, cols, type); + return buf; +} + +#endif //////////////////////////////////////////////////////////////// // Event