From a52af84dcf7f294ffe6605afdf15f90bf2f7968a Mon Sep 17 00:00:00 2001
From: Vladislav Vinogradov
Date: Tue, 16 Apr 2013 14:39:42 +0400
Subject: [PATCH] refactored CudaMem (now alloc type is assigned only in constructor)

---
 modules/core/include/opencv2/core/gpu.hpp     |  51 +--
 modules/core/include/opencv2/core/gpu.inl.hpp | 153 +++++++++
 modules/core/src/gpu.cpp                      |  10 +
 modules/core/src/gpu_cuda_mem.cpp             | 292 +++++-------------
 modules/core/src/gpu_stream.cpp               |   2 +-
 5 files changed, 275 insertions(+), 233 deletions(-)

diff --git a/modules/core/include/opencv2/core/gpu.hpp b/modules/core/include/opencv2/core/gpu.hpp
index cab7df568..c22803bad 100644
--- a/modules/core/include/opencv2/core/gpu.hpp
+++ b/modules/core/include/opencv2/core/gpu.hpp
@@ -252,66 +252,59 @@ public:
     uchar* dataend;
 };

-//! Creates continuous GPU matrix
+//! creates continuous GPU matrix
 CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);

-//! Ensures that size of the given matrix is not less than (rows, cols) size
+//! ensures that size of the given matrix is not less than (rows, cols) size
 //! and matrix type matches the specified one too
 CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);

 CV_EXPORTS GpuMat allocMatFromBuf(int rows, int cols, int type, GpuMat& mat);

 //////////////////////////////// CudaMem ////////////////////////////////
+
 // CudaMem is a limited cv::Mat with page-locked memory allocation.
 // Page-locked memory is only needed for async and faster copying to GPU.
 // It is convertible to cv::Mat header without reference counting
 // so you can use it with other opencv functions.

-// Page-locks the matrix m memory and maps it for the device(s)
-CV_EXPORTS void registerPageLocked(Mat& m);
-
-// Unmaps the memory of matrix m, and makes it pageable again.
-CV_EXPORTS void unregisterPageLocked(Mat& m);
-
 class CV_EXPORTS CudaMem
 {
 public:
-    enum { ALLOC_PAGE_LOCKED = 1, ALLOC_ZEROCOPY = 2, ALLOC_WRITE_COMBINED = 4 };
+    enum AllocType { PAGE_LOCKED = 1, SHARED = 2, WRITE_COMBINED = 4 };
+
+    explicit CudaMem(AllocType alloc_type = PAGE_LOCKED);

-    CudaMem();
     CudaMem(const CudaMem& m);

-    CudaMem(int rows, int cols, int type, int _alloc_type = ALLOC_PAGE_LOCKED);
-    CudaMem(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+    CudaMem(int rows, int cols, int type, AllocType alloc_type = PAGE_LOCKED);
+    CudaMem(Size size, int type, AllocType alloc_type = PAGE_LOCKED);

-
-    //! creates from cv::Mat with coping data
-    explicit CudaMem(const Mat& m, int alloc_type = ALLOC_PAGE_LOCKED);
+    //! creates from host memory, copying the data
+    explicit CudaMem(InputArray arr, AllocType alloc_type = PAGE_LOCKED);

     ~CudaMem();

-    CudaMem& operator = (const CudaMem& m);
+    CudaMem& operator =(const CudaMem& m);
+
+    //! swaps with other smart pointer
+    void swap(CudaMem& b);

     //! returns deep copy of the matrix, i.e. the data is copied
     CudaMem clone() const;

     //! allocates new matrix data unless the matrix already has the specified size and type.
-    void create(int rows, int cols, int type, int alloc_type = ALLOC_PAGE_LOCKED);
-    void create(Size size, int type, int alloc_type = ALLOC_PAGE_LOCKED);
+    void create(int rows, int cols, int type);
+    void create(Size size, int type);

     //! decrements reference counter and releases memory if needed.
     void release();

     //! returns matrix header with disabled reference counting for CudaMem data.
     Mat createMatHeader() const;
-    operator Mat() const;

     //! maps host memory into device address space and returns GpuMat header for it. Throws exception if not supported by hardware.
     GpuMat createGpuMatHeader() const;
-    operator GpuMat() const;
-
-    //returns if host memory can be mapperd to gpu address space;
-    static bool canMapHostMemory();

     // Please see cv::Mat for descriptions
     bool isContinuous() const;
@@ -324,7 +317,6 @@ public:
     Size size() const;
     bool empty() const;

-    // Please see cv::Mat for descriptions
     int flags;
     int rows, cols;
@@ -336,9 +328,14 @@ public:
     uchar* datastart;
     uchar* dataend;

-    int alloc_type;
+    AllocType alloc_type;
 };

+//! page-locks the matrix m memory and maps it for the device(s)
+CV_EXPORTS void registerPageLocked(Mat& m);
+
+//! unmaps the memory of matrix m, and makes it pageable again
+CV_EXPORTS void unregisterPageLocked(Mat& m);

 //////////////////////////////// CudaStream ////////////////////////////////
 // Encapsulates Cuda Stream. Provides interface for async copying.
@@ -480,6 +477,10 @@ public:
     // Checks whether the GPU module can be run on the given device
     bool isCompatible() const;

+    bool canMapHostMemory() const;
+
+    size_t textureAlignment() const;
+
     int deviceID() const { return device_id_; }

 private:
diff --git a/modules/core/include/opencv2/core/gpu.inl.hpp b/modules/core/include/opencv2/core/gpu.inl.hpp
index acc1f2dea..10b8ff594 100644
--- a/modules/core/include/opencv2/core/gpu.inl.hpp
+++ b/modules/core/include/opencv2/core/gpu.inl.hpp
@@ -373,8 +373,161 @@ void swap(GpuMat& a, GpuMat& b)
     a.swap(b);
 }

+//////////////////////////////// CudaMem ////////////////////////////////
+
+inline
+CudaMem::CudaMem(AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+}
+
+inline
+CudaMem::CudaMem(const CudaMem& m)
+    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
+{
+    if( refcount )
+        CV_XADD(refcount, 1);
+}
+
+inline
+CudaMem::CudaMem(int rows_, int cols_, int type_, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (rows_ > 0 && cols_ > 0)
+        create(rows_, cols_, type_);
+}
+
+inline
+CudaMem::CudaMem(Size size_, int type_, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    if (size_.height > 0 && size_.width > 0)
+        create(size_.height, size_.width, type_);
+}
+
+inline
+CudaMem::CudaMem(InputArray arr, AllocType alloc_type_)
+    : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(alloc_type_)
+{
+    arr.getMat().copyTo(*this);
+}
+
+inline
+CudaMem::~CudaMem()
+{
+    release();
+}
+
+inline
+CudaMem& CudaMem::operator =(const CudaMem& m)
+{
+    if (this != &m)
+    {
+        CudaMem temp(m);
+        swap(temp);
+    }
+
+    return *this;
+}
+
+inline
+void CudaMem::swap(CudaMem& b)
+{
+    std::swap(flags, b.flags);
+    std::swap(rows, b.rows);
+    std::swap(cols, b.cols);
+    std::swap(step, b.step);
+    std::swap(data, b.data);
+    std::swap(datastart, b.datastart);
+    std::swap(dataend, b.dataend);
+    std::swap(refcount, b.refcount);
+    std::swap(alloc_type, b.alloc_type);
+}
+
+inline
+CudaMem CudaMem::clone() const
+{
+    CudaMem m(size(), type(), alloc_type);
+    createMatHeader().copyTo(m);
+    return m;
+}
+
+inline
+void CudaMem::create(Size size_, int type_)
+{
+    create(size_.height, size_.width, type_);
+}
+
+inline
+Mat CudaMem::createMatHeader() const
+{
+    return Mat(size(), type(), data, step);
step); +} + +inline +bool CudaMem::isContinuous() const +{ + return (flags & Mat::CONTINUOUS_FLAG) != 0; +} + +inline +size_t CudaMem::elemSize() const +{ + return CV_ELEM_SIZE(flags); +} + +inline +size_t CudaMem::elemSize1() const +{ + return CV_ELEM_SIZE1(flags); +} + +inline +int CudaMem::type() const +{ + return CV_MAT_TYPE(flags); +} + +inline +int CudaMem::depth() const +{ + return CV_MAT_DEPTH(flags); +} + +inline +int CudaMem::channels() const +{ + return CV_MAT_CN(flags); +} + +inline +size_t CudaMem::step1() const +{ + return step / elemSize1(); +} + +inline +Size CudaMem::size() const +{ + return Size(cols, rows); +} + +inline +bool CudaMem::empty() const +{ + return data == 0; +} + +static inline +void swap(CudaMem& a, CudaMem& b) +{ + a.swap(b); +} + }} // namespace cv { namespace gpu +//////////////////////////////// Mat //////////////////////////////// + namespace cv { inline diff --git a/modules/core/src/gpu.cpp b/modules/core/src/gpu.cpp index ce3808805..2216ec70c 100644 --- a/modules/core/src/gpu.cpp +++ b/modules/core/src/gpu.cpp @@ -317,6 +317,16 @@ size_t cv::gpu::DeviceInfo::sharedMemPerBlock() const return deviceProps.get(device_id_)->sharedMemPerBlock; } +bool cv::gpu::DeviceInfo::canMapHostMemory() const +{ + return deviceProps.get(device_id_)->canMapHostMemory != 0; +} + +size_t cv::gpu::DeviceInfo::textureAlignment() const +{ + return deviceProps.get(device_id_)->textureAlignment; +} + void cv::gpu::DeviceInfo::queryMemory(size_t& _totalMemory, size_t& _freeMemory) const { int prevDeviceID = getDevice(); diff --git a/modules/core/src/gpu_cuda_mem.cpp b/modules/core/src/gpu_cuda_mem.cpp index 723c38aa0..3681fd77f 100644 --- a/modules/core/src/gpu_cuda_mem.cpp +++ b/modules/core/src/gpu_cuda_mem.cpp @@ -7,11 +7,12 @@ // copy or use the software. // // -// License Agreement +// License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Third party copyrights are property of their respective owners. 
// // Redistribution and use in source and binary forms, with or without modification, @@ -45,217 +46,70 @@ using namespace cv; using namespace cv::gpu; -cv::gpu::CudaMem::CudaMem() - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) -{ -} - -cv::gpu::CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) -{ - if( _rows > 0 && _cols > 0 ) - create( _rows, _cols, _type, _alloc_type); -} - -cv::gpu::CudaMem::CudaMem(Size _size, int _type, int _alloc_type) - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) -{ - if( _size.height > 0 && _size.width > 0 ) - create( _size.height, _size.width, _type, _alloc_type); -} - -cv::gpu::CudaMem::CudaMem(const CudaMem& m) - : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type) -{ - if( refcount ) - CV_XADD(refcount, 1); -} - -cv::gpu::CudaMem::CudaMem(const Mat& m, int _alloc_type) - : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) -{ - if( m.rows > 0 && m.cols > 0 ) - create( m.size(), m.type(), _alloc_type); - - Mat tmp = createMatHeader(); - m.copyTo(tmp); -} - -cv::gpu::CudaMem::~CudaMem() -{ - release(); -} - -CudaMem& cv::gpu::CudaMem::operator = (const CudaMem& m) -{ - if( this != &m ) - { - if( m.refcount ) - CV_XADD(m.refcount, 1); - release(); - flags = m.flags; - rows = m.rows; cols = m.cols; - step = m.step; data = m.data; - datastart = m.datastart; - dataend = m.dataend; - refcount = m.refcount; - alloc_type = m.alloc_type; - } - return *this; -} - -CudaMem cv::gpu::CudaMem::clone() const -{ - CudaMem m(size(), type(), alloc_type); - Mat to = m; - Mat from = *this; - from.copyTo(to); - return m; -} - -void cv::gpu::CudaMem::create(Size _size, int _type, int _alloc_type) -{ - create(_size.height, _size.width, _type, _alloc_type); -} - -Mat cv::gpu::CudaMem::createMatHeader() const -{ - return Mat(size(), type(), data, step); -} - -cv::gpu::CudaMem::operator Mat() const -{ - return createMatHeader(); -} - -cv::gpu::CudaMem::operator GpuMat() const -{ - return createGpuMatHeader(); -} - -bool cv::gpu::CudaMem::isContinuous() const -{ - return (flags & Mat::CONTINUOUS_FLAG) != 0; -} - -size_t cv::gpu::CudaMem::elemSize() const -{ - return CV_ELEM_SIZE(flags); -} - -size_t cv::gpu::CudaMem::elemSize1() const -{ - return CV_ELEM_SIZE1(flags); -} - -int cv::gpu::CudaMem::type() const -{ - return CV_MAT_TYPE(flags); -} - -int cv::gpu::CudaMem::depth() const -{ - return CV_MAT_DEPTH(flags); -} - -int cv::gpu::CudaMem::channels() const -{ - return CV_MAT_CN(flags); -} - -size_t cv::gpu::CudaMem::step1() const -{ - return step/elemSize1(); -} - -Size cv::gpu::CudaMem::size() const -{ - return Size(cols, rows); -} - -bool cv::gpu::CudaMem::empty() const -{ - return data == 0; -} - -#if !defined (HAVE_CUDA) - -void cv::gpu::registerPageLocked(Mat&) { throw_no_cuda(); } -void cv::gpu::unregisterPageLocked(Mat&) { throw_no_cuda(); } -void cv::gpu::CudaMem::create(int, int, int, int) { throw_no_cuda(); } -bool cv::gpu::CudaMem::canMapHostMemory() { throw_no_cuda(); return false; } -void cv::gpu::CudaMem::release() { throw_no_cuda(); } -GpuMat cv::gpu::CudaMem::createGpuMatHeader () const { throw_no_cuda(); return GpuMat(); } - -#else /* !defined (HAVE_CUDA) */ - -void 
cv::gpu::registerPageLocked(Mat& m) -{ - cudaSafeCall( cudaHostRegister(m.ptr(), m.step * m.rows, cudaHostRegisterPortable) ); -} - -void cv::gpu::unregisterPageLocked(Mat& m) -{ - cudaSafeCall( cudaHostUnregister(m.ptr()) ); -} - -bool cv::gpu::CudaMem::canMapHostMemory() -{ - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, getDevice()) ); - return (prop.canMapHostMemory != 0) ? true : false; -} - namespace { size_t alignUpStep(size_t what, size_t alignment) { - size_t alignMask = alignment-1; + size_t alignMask = alignment - 1; size_t inverseAlignMask = ~alignMask; size_t res = (what + alignMask) & inverseAlignMask; return res; } } -void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type) +void cv::gpu::CudaMem::create(int rows_, int cols_, int type_) { - if (_alloc_type == ALLOC_ZEROCOPY && !canMapHostMemory()) - CV_Error(cv::Error::GpuApiCallError, "ZeroCopy is not supported by current device"); - - _type &= Mat::TYPE_MASK; - if( rows == _rows && cols == _cols && type() == _type && data ) - return; - if( data ) - release(); - CV_DbgAssert( _rows >= 0 && _cols >= 0 ); - if( _rows > 0 && _cols > 0 ) +#ifndef HAVE_CUDA + (void) rows_; + (void) cols_; + (void) type_; + throw_no_cuda(); +#else + if (alloc_type == SHARED) { - flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + _type; - rows = _rows; - cols = _cols; - step = elemSize()*cols; - if (_alloc_type == ALLOC_ZEROCOPY) + DeviceInfo devInfo; + CV_Assert( devInfo.canMapHostMemory() ); + } + + type_ &= Mat::TYPE_MASK; + + if (rows == rows_ && cols == cols_ && type() == type_ && data) + return; + + if (data) + release(); + + CV_DbgAssert( rows_ >= 0 && cols_ >= 0 ); + + if (rows_ > 0 && cols_ > 0) + { + flags = Mat::MAGIC_VAL + Mat::CONTINUOUS_FLAG + type_; + rows = rows_; + cols = cols_; + step = elemSize() * cols; + + if (alloc_type == SHARED) { - cudaDeviceProp prop; - cudaSafeCall( cudaGetDeviceProperties(&prop, getDevice()) ); - step = alignUpStep(step, prop.textureAlignment); + DeviceInfo devInfo; + step = alignUpStep(step, devInfo.textureAlignment()); } + int64 _nettosize = (int64)step*rows; size_t nettosize = (size_t)_nettosize; - if( _nettosize != (int64)nettosize ) - CV_Error(CV_StsNoMem, "Too big buffer is allocated"); + + if (_nettosize != (int64)nettosize) + CV_Error(cv::Error::StsNoMem, "Too big buffer is allocated"); + size_t datasize = alignSize(nettosize, (int)sizeof(*refcount)); - //datastart = data = (uchar*)fastMalloc(datasize + sizeof(*refcount)); - alloc_type = _alloc_type; - void *ptr = 0; + void* ptr = 0; switch (alloc_type) { - case ALLOC_PAGE_LOCKED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocDefault) ); break; - case ALLOC_ZEROCOPY: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocMapped) ); break; - case ALLOC_WRITE_COMBINED: cudaSafeCall( cudaHostAlloc( &ptr, datasize, cudaHostAllocWriteCombined) ); break; - default: CV_Error(cv::Error::StsBadFlag, "Invalid alloc type"); + case PAGE_LOCKED: cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocDefault) ); break; + case SHARED: cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocMapped) ); break; + case WRITE_COMBINED: cudaSafeCall( cudaHostAlloc(&ptr, datasize, cudaHostAllocWriteCombined) ); break; + default: CV_Error(cv::Error::StsBadFlag, "Invalid alloc type"); } datastart = data = (uchar*)ptr; @@ -264,31 +118,55 @@ void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type) refcount = (int*)cv::fastMalloc(sizeof(*refcount)); *refcount = 1; } -} - -GpuMat 
cv::gpu::CudaMem::createGpuMatHeader () const -{ - CV_Assert( alloc_type == ALLOC_ZEROCOPY ); - - GpuMat res; - - void *pdev; - cudaSafeCall( cudaHostGetDevicePointer( &pdev, data, 0 ) ); - res = GpuMat(rows, cols, type(), pdev, step); - - return res; +#endif } void cv::gpu::CudaMem::release() { - if( refcount && CV_XADD(refcount, -1) == 1 ) +#ifdef HAVE_CUDA + if (refcount && CV_XADD(refcount, -1) == 1) { - cudaSafeCall( cudaFreeHost(datastart ) ); + cudaFreeHost(datastart); fastFree(refcount); } + data = datastart = dataend = 0; step = rows = cols = 0; refcount = 0; +#endif } -#endif /* !defined (HAVE_CUDA) */ +GpuMat cv::gpu::CudaMem::createGpuMatHeader() const +{ +#ifndef HAVE_CUDA + throw_no_cuda(); + return GpuMat(); +#else + CV_Assert( alloc_type == SHARED ); + + void *pdev; + cudaSafeCall( cudaHostGetDevicePointer(&pdev, data, 0) ); + + return GpuMat(rows, cols, type(), pdev, step); +#endif +} + +void cv::gpu::registerPageLocked(Mat& m) +{ +#ifndef HAVE_CUDA + (void) m; + throw_no_cuda(); +#else + CV_Assert( m.isContinuous() ); + cudaSafeCall( cudaHostRegister(m.data, m.step * m.rows, cudaHostRegisterPortable) ); +#endif +} + +void cv::gpu::unregisterPageLocked(Mat& m) +{ +#ifndef HAVE_CUDA + (void) m; +#else + cudaSafeCall( cudaHostUnregister(m.data) ); +#endif +} diff --git a/modules/core/src/gpu_stream.cpp b/modules/core/src/gpu_stream.cpp index 4a911fe40..251e3a2aa 100644 --- a/modules/core/src/gpu_stream.cpp +++ b/modules/core/src/gpu_stream.cpp @@ -145,7 +145,7 @@ void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst) void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) { - dst.create(src.size(), src.type(), CudaMem::ALLOC_PAGE_LOCKED); + dst.create(src.size(), src.type()); cudaStream_t stream = Impl::getStream(impl); size_t bwidth = src.cols * src.elemSize();
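
Usage note (an illustrative sketch, not part of the patch): after this refactoring the allocation kind is fixed when a CudaMem is constructed, create() no longer takes an alloc type (see the gpu_stream.cpp hunk above), and zero-copy mapping is gated by DeviceInfo::canMapHostMemory() instead of the removed CudaMem::canMapHostMemory(). The types and functions below come from the headers changed in this patch; the include path, image size, type, and fill value are assumptions chosen for the example.

    #include <opencv2/core/gpu.hpp>

    using namespace cv;
    using namespace cv::gpu;

    int main()
    {
        // Alloc type is chosen once, at construction; a later create() call
        // (e.g. inside Stream::enqueueDownload) reuses it, so the buffer
        // stays page-locked without the caller passing the alloc type again.
        CudaMem pageLocked(480, 640, CV_8UC1, CudaMem::PAGE_LOCKED);

        Mat header = pageLocked.createMatHeader(); // plain Mat view, no reference counting
        header.setTo(Scalar::all(127));            // fill the host buffer in place

        // SHARED (zero-copy) memory can be mapped into the device address
        // space, but only when the device supports host memory mapping.
        DeviceInfo info;
        if (info.canMapHostMemory())
        {
            CudaMem shared(480, 640, CV_8UC1, CudaMem::SHARED);
            GpuMat mapped = shared.createGpuMatHeader(); // device view of the host data
            // ... launch GPU work on `mapped` here (placeholder) ...
        }

        return 0;
    }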