From 988ab79acb307f2be951dba565cb48ee286ae2b2 Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov <vlad.vinogradov@itseez.com>
Date: Thu, 3 Oct 2013 17:22:18 +0400
Subject: [PATCH] added BufferAllocator

---
 .../include/opencv2/core/private.cuda.hpp     |  23 +
 modules/core/src/cuda_buffer_pool.cpp         | 416 ++++++++++++++++++
 modules/core/src/cuda_stream.cpp              |   2 +-
 modules/cuda/perf/perf_buffer_pool.cpp        | 114 +++++
 modules/cuda/test/test_buffer_pool.cpp        | 120 +++++
 5 files changed, 674 insertions(+), 1 deletion(-)
 create mode 100644 modules/core/src/cuda_buffer_pool.cpp
 create mode 100644 modules/cuda/perf/perf_buffer_pool.cpp
 create mode 100644 modules/cuda/test/test_buffer_pool.cpp

diff --git a/modules/core/include/opencv2/core/private.cuda.hpp b/modules/core/include/opencv2/core/private.cuda.hpp
index 3c4523183..3760c1888 100644
--- a/modules/core/include/opencv2/core/private.cuda.hpp
+++ b/modules/core/include/opencv2/core/private.cuda.hpp
@@ -90,6 +90,29 @@ static inline void throw_no_cuda() { CV_Error(cv::Error::StsNotImplemented, "The
 
 namespace cv { namespace cuda
 {
+    class MemoryStack;
+
+    class CV_EXPORTS BufferAllocator : public GpuMat::Allocator
+    {
+    public:
+        explicit BufferAllocator(Stream& stream);
+        ~BufferAllocator();
+
+        bool allocate(uchar** devPtr, size_t* step, int** refcount, int rows, int cols, size_t elemSize);
+        void free(uchar* devPtr, int* refcount);
+
+    private:
+        BufferAllocator(const BufferAllocator&);
+        BufferAllocator& operator =(const BufferAllocator&);
+
+        MemoryStack* memStack_;
+        Stream stream_;
+        size_t alignment_;
+    };
+
+    CV_EXPORTS void setBufferAllocatorUsage(bool on);
+    CV_EXPORTS void allocateMemoryPool(int deviceId, size_t stackSize, int stackCount);
+
     static inline void checkNppError(int code, const char* file, const int line, const char* func)
     {
         if (code < 0)
diff --git a/modules/core/src/cuda_buffer_pool.cpp b/modules/core/src/cuda_buffer_pool.cpp
new file mode 100644
index 000000000..162811eba
--- /dev/null
+++ b/modules/core/src/cuda_buffer_pool.cpp
@@ -0,0 +1,416 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+using namespace cv;
+using namespace cv::cuda;
+
+#ifdef HAVE_CUDA
+
+#include "opencv2/cudev/common.hpp"
+
+/////////////////////////////////////////////////////////////
+/// MemoryStack
+
+namespace
+{
+    class MemoryPool;
+}
+
+class cv::cuda::MemoryStack
+{
+public:
+    uchar* requestMemory(size_t size);
+    void returnMemory(uchar* ptr);
+
+    uchar* datastart;
+    uchar* dataend;
+    uchar* tip;
+
+    bool isFree;
+    MemoryPool* pool;
+
+#if defined(DEBUG) || defined(_DEBUG)
+    std::vector<size_t> allocations;
+#endif
+};
+
+uchar* cv::cuda::MemoryStack::requestMemory(size_t size)
+{
+    const size_t freeMem = dataend - tip;
+
+    if (size > freeMem)
+        return 0;
+
+    uchar* ptr = tip;
+
+    tip += size;
+
+#if defined(DEBUG) || defined(_DEBUG)
+    allocations.push_back(size);
+#endif
+
+    return ptr;
+}
+
+void cv::cuda::MemoryStack::returnMemory(uchar* ptr)
+{
+    CV_DbgAssert( ptr >= datastart && ptr < dataend );
+
+#if defined(DEBUG) || defined(_DEBUG)
+    const size_t allocSize = tip - ptr;
+    CV_Assert( allocSize == allocations.back() );
+    allocations.pop_back();
+#endif
+
+    tip = ptr;
+}
+
+/////////////////////////////////////////////////////////////
+/// MemoryPool
+
+namespace
+{
+    class MemoryPool
+    {
+    public:
+        MemoryPool();
+
+        void initialize(size_t stackSize, int stackCount);
+        void release();
+
+        MemoryStack* getFreeMemStack();
+        void returnMemStack(MemoryStack* memStack);
+
+    private:
+        void initilizeImpl();
+
+        Mutex mtx_;
+
+        bool initialized_;
+        size_t stackSize_;
+        int stackCount_;
+
+        uchar* mem_;
+
+        std::vector<MemoryStack> stacks_;
+    };
+
+    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
+    {
+        // default : 10 Mb, 5 stacks
+        stackSize_ = 10 * 1024 * 1024;
+        stackCount_ = 5;
+    }
+
+    void MemoryPool::initialize(size_t stackSize, int stackCount)
+    {
+        AutoLock lock(mtx_);
+
+        release();
+
+        stackSize_ = stackSize;
+        stackCount_ = stackCount;
+
+        initilizeImpl();
+    }
+
+    void MemoryPool::initilizeImpl()
+    {
+        const size_t totalSize = stackSize_ * stackCount_;
+
+        if (totalSize > 0)
+        {
+            cudaError_t err = cudaMalloc(&mem_, totalSize);
+            if (err != cudaSuccess)
+                return;
+
+            stacks_.resize(stackCount_);
+
+            uchar* ptr = mem_;
+
+            for (int i = 0; i < stackCount_; ++i)
+            {
+                stacks_[i].datastart = ptr;
+                stacks_[i].dataend = ptr + stackSize_;
+                stacks_[i].tip = ptr;
+                stacks_[i].isFree = true;
+                stacks_[i].pool = this;
+
+                ptr += stackSize_;
+            }
+
+            initialized_ = true;
+        }
+    }
+
+    void MemoryPool::release()
+    {
+        if (mem_)
+        {
+#if defined(DEBUG) || defined(_DEBUG)
+            for (int i = 0; i < stackCount_; ++i)
+            {
+                CV_DbgAssert( stacks_[i].isFree );
+                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
+            }
+#endif
+
+            cudaFree( mem_ );
+
+            mem_ = 0;
+            initialized_ = false;
+        }
+    }
+
+    MemoryStack* MemoryPool::getFreeMemStack()
+    {
+        AutoLock lock(mtx_);
+
+        if (!initialized_)
+            initilizeImpl();
+
+        if (!mem_)
+            return 0;
+
+        for (int i = 0; i < stackCount_; ++i)
+        {
+            if (stacks_[i].isFree)
+            {
+                stacks_[i].isFree = false;
+                return &stacks_[i];
+            }
+        }
+
+        return 0;
+    }
+
+    void MemoryPool::returnMemStack(MemoryStack* memStack)
+    {
+        AutoLock lock(mtx_);
+
+        CV_DbgAssert( !memStack->isFree );
+
+#if defined(DEBUG) || defined(_DEBUG)
+        bool found = false;
+        for (int i = 0; i < stackCount_; ++i)
+        {
+            if (memStack == &stacks_[i])
+            {
+                found = true;
+                break;
+            }
+        }
+        CV_DbgAssert( found );
+#endif
+
+        CV_DbgAssert( memStack->tip == memStack->datastart );
+
+        memStack->isFree = true;
+    }
+}
+
+/////////////////////////////////////////////////////////////
+/// MemoryPoolManager
+
+namespace
+{
+    class MemoryPoolManager
+    {
+    public:
+        MemoryPoolManager();
+        ~MemoryPoolManager();
+
+        MemoryPool* getPool(int deviceId);
+
+    private:
+        std::vector<MemoryPool> pools_;
+    };
+
+    MemoryPoolManager::MemoryPoolManager()
+    {
+        int deviceCount = getCudaEnabledDeviceCount();
+
+        if (deviceCount > 0)
+            pools_.resize(deviceCount);
+    }
+
+    MemoryPoolManager::~MemoryPoolManager()
+    {
+        for (size_t i = 0; i < pools_.size(); ++i)
+        {
+            cudaSetDevice(i);
+            pools_[i].release();
+        }
+    }
+
+    MemoryPool* MemoryPoolManager::getPool(int deviceId)
+    {
+        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
+        return &pools_[deviceId];
+    }
+
+    MemoryPool* memPool(int deviceId)
+    {
+        static MemoryPoolManager manager;
+        return manager.getPool(deviceId);
+    }
+}
+
+/////////////////////////////////////////////////////////////
+/// BufferAllocator
+
+namespace
+{
+    bool enableMemoryPool = true;
+}
+
+cv::cuda::BufferAllocator::BufferAllocator(Stream& stream) : memStack_(0), stream_(stream)
+{
+    if (enableMemoryPool)
+    {
+        const int deviceId = getDevice();
+        memStack_ = memPool(deviceId)->getFreeMemStack();
+
+        DeviceInfo devInfo(deviceId);
+        alignment_ = devInfo.textureAlignment();
+    }
+}
+
+namespace
+{
+    void CUDART_CB returnMemStackCallback(cudaStream_t, cudaError_t, void* userData)
+    {
+        MemoryStack* memStack = static_cast<MemoryStack*>(userData);
+        memStack->pool->returnMemStack(memStack);
+    }
+}
+
+cv::cuda::BufferAllocator::~BufferAllocator()
+{
+    if (memStack_ != 0)
+        CV_CUDEV_SAFE_CALL( cudaStreamAddCallback(StreamAccessor::getStream(stream_), returnMemStackCallback, memStack_, 0) );
+}
+
+namespace
+{
+    size_t alignUp(size_t what, size_t alignment)
+    {
+        size_t alignMask = alignment-1;
+        size_t inverseAlignMask = ~alignMask;
+        size_t res = (what + alignMask) & inverseAlignMask;
+        return res;
+    }
+}
+
+bool cv::cuda::BufferAllocator::allocate(uchar** devPtr, size_t* step, int** refcount, int rows, int cols, size_t elemSize)
+{
+    if (memStack_ == 0)
+        return false;
+
+    size_t pitch, memSize;
+
+    if (rows > 1 && cols > 1)
+    {
+        pitch = alignUp(cols * elemSize, alignment_);
+        memSize = pitch * rows;
+    }
+    else
+    {
+        // Single row or single column must be continuous
+        pitch = elemSize * cols;
+        memSize = alignUp(elemSize * cols * rows, 64);
+    }
+
+    uchar* ptr = memStack_->requestMemory(memSize);
+
+    if (ptr == 0)
+        return false;
+
+    *devPtr = ptr;
+    *step = pitch;
+
+    *refcount = static_cast<int*>(fastMalloc(sizeof(int)));
+
+    return true;
+}
+
+void cv::cuda::BufferAllocator::free(uchar* devPtr, int* refcount)
+{
+    if (memStack_ == 0)
+        return;
+
+    memStack_->returnMemory(devPtr);
+    fastFree(refcount);
+}
+
+void cv::cuda::setBufferAllocatorUsage(bool on)
+{
+    enableMemoryPool = on;
+}
+
+void cv::cuda::allocateMemoryPool(int deviceId, size_t stackSize, int stackCount)
+{
+    const int currentDevice = getDevice();
+
+    if (deviceId >= 0)
+    {
+        setDevice(deviceId);
+        memPool(deviceId)->initialize(stackSize, stackCount);
+    }
+    else
+    {
+        const int deviceCount = getCudaEnabledDeviceCount();
+
+        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
+        {
+            setDevice(deviceId);
+            memPool(deviceId)->initialize(stackSize, stackCount);
+        }
+    }
+
+    setDevice(currentDevice);
+}
+
+#endif
diff --git a/modules/core/src/cuda_stream.cpp b/modules/core/src/cuda_stream.cpp
index 3fdc83867..be298919b 100644
--- a/modules/core/src/cuda_stream.cpp
+++ b/modules/core/src/cuda_stream.cpp
@@ -197,7 +197,7 @@ cv::cuda::Stream::operator bool_type() const
 
 
 ////////////////////////////////////////////////////////////////
-// Stream
+// Event
 
 #ifndef HAVE_CUDA
 
diff --git a/modules/cuda/perf/perf_buffer_pool.cpp b/modules/cuda/perf/perf_buffer_pool.cpp
new file mode 100644
index 000000000..9bab24bbc
--- /dev/null
+++ b/modules/cuda/perf/perf_buffer_pool.cpp
@@ -0,0 +1,114 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "perf_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/core/private.cuda.hpp"
+
+using namespace testing;
+using namespace perf;
+using namespace cv;
+using namespace cv::cuda;
+
+namespace
+{
+    void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
+    {
+        BufferAllocator bufAlloc(stream);
+
+        GpuMat buf(&bufAlloc);
+
+        src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
+
+        cuda::exp(buf, dst, stream);
+    }
+
+    void func2(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+    {
+        BufferAllocator bufAlloc(stream);
+
+        GpuMat buf1(&bufAlloc);
+
+        func1(src1, buf1, stream);
+
+        GpuMat buf2(&bufAlloc);
+
+        func1(src2, buf2, stream);
+
+        cuda::add(buf1, buf2, dst, noArray(), -1, stream);
+    }
+}
+
+PERF_TEST_P(Sz, BufferPool, CUDA_TYPICAL_MAT_SIZES)
+{
+    static bool first = true;
+
+    const Size size = GetParam();
+
+    const bool useBufferPool = PERF_RUN_CUDA();
+
+    Mat host_src(size, CV_8UC1);
+    declare.in(host_src, WARMUP_RNG);
+
+    GpuMat src1(host_src), src2(host_src);
+    GpuMat dst;
+
+    setBufferAllocatorUsage(useBufferPool);
+    if (useBufferPool && first)
+    {
+        allocateMemoryPool(-1, 25 * 1024 * 1024, 2);
+        first = false;
+    }
+
+    TEST_CYCLE()
+    {
+        func2(src1, src2, dst, Stream::Null());
+    }
+
+    Mat h_dst(dst);
+    SANITY_CHECK(h_dst);
+}
+
+#endif
diff --git a/modules/cuda/test/test_buffer_pool.cpp b/modules/cuda/test/test_buffer_pool.cpp
new file mode 100644
index 000000000..ea3ca80e4
--- /dev/null
+++ b/modules/cuda/test/test_buffer_pool.cpp
@@ -0,0 +1,120 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+
+#ifdef HAVE_CUDA
+
+#include "opencv2/cudaarithm.hpp"
+#include "opencv2/cudawarping.hpp"
+#include "opencv2/core/private.cuda.hpp"
+
+using namespace testing;
+using namespace cv;
+using namespace cv::cuda;
+
+struct BufferPool : TestWithParam<DeviceInfo>
+{
+};
+
+namespace
+{
+    void func1(const GpuMat& src, GpuMat& dst, Stream& stream)
+    {
+        BufferAllocator bufAlloc(stream);
+
+        GpuMat buf(&bufAlloc);
+
+        src.convertTo(buf, CV_32F, 1.0 / 255.0, stream);
+
+        cuda::exp(buf, dst, stream);
+    }
+
+    void func2(const GpuMat& src, GpuMat& dst, Stream& stream)
+    {
+        BufferAllocator bufAlloc(stream);
+
+        GpuMat buf1(&bufAlloc);
+
+        cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST, stream);
+
+        GpuMat buf2(&bufAlloc);
+
+        func1(buf1, buf2, stream);
+
+        GpuMat buf3(&bufAlloc);
+
+        cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST, stream);
+
+        buf3.convertTo(dst, CV_8U, stream);
+    }
+}
+
+CUDA_TEST_P(BufferPool, Test)
+{
+    DeviceInfo devInfo = GetParam();
+    setDevice(devInfo.deviceID());
+
+    GpuMat src(200, 200, CV_8UC1);
+    GpuMat dst;
+
+    Stream stream;
+
+    func2(src, dst, stream);
+
+    stream.waitForCompletion();
+
+    GpuMat buf, buf1, buf2, buf3;
+    GpuMat dst_gold;
+
+    cuda::resize(src, buf1, Size(), 0.5, 0.5, cv::INTER_NEAREST);
+    buf1.convertTo(buf, CV_32F, 1.0 / 255.0);
+    cuda::exp(buf, buf2);
+    cuda::resize(buf2, buf3, src.size(), 0, 0, cv::INTER_NEAREST);
+    buf3.convertTo(dst_gold, CV_8U);
+
+    ASSERT_MAT_NEAR(dst_gold, dst, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Stream, BufferPool, ALL_DEVICES);
+
+#endif // HAVE_CUDA