Implemented asynchronous calls for GpuMat::setTo(), GpuMat::copyTo(), GpuMat::convertTo()
parent 1ead3a5b02
commit 769564c130
@@ -49,24 +49,24 @@
 namespace cv
 {
     namespace gpu
     {
         //////////////////////////////// Initialization ////////////////////////

         //! This is the only function that does not throw exceptions if the library is compiled without CUDA.
         CV_EXPORTS int getCudaEnabledDeviceCount();

         //! Functions below throw cv::Exception if the library is compiled without CUDA.
         CV_EXPORTS string getDeviceName(int device);
         CV_EXPORTS void setDevice(int device);
         CV_EXPORTS int getDevice();

         CV_EXPORTS void getComputeCapability(int device, int* major, int* minor);
         CV_EXPORTS int getNumberOfSMs(int device);

         //////////////////////////////// GpuMat ////////////////////////////////
-        class CudaStrem;
+        class CudaStream;

         //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar to cv::Mat.
         class CV_EXPORTS GpuMat
         {
         public:
@@ -81,7 +81,7 @@ namespace cv
             GpuMat(Size _size, int _type, const Scalar& _s);
             //! copy constructor
             GpuMat(const GpuMat& m);

             //! constructor for GpuMatrix headers pointing to user-allocated data
             GpuMat(int _rows, int _cols, int _type, void* _data, size_t _step = Mat::AUTO_STEP);
             GpuMat(Size _size, int _type, void* _data, size_t _step = Mat::AUTO_STEP);
@@ -89,7 +89,7 @@ namespace cv
             //! creates a matrix header for a part of the bigger matrix
             GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);
             GpuMat(const GpuMat& m, const Rect& roi);

             //! builds GpuMat from Mat. Performs a blocking upload to the device.
             explicit GpuMat (const Mat& m);
@@ -99,7 +99,7 @@ namespace cv
             //! assignment operators
             GpuMat& operator = (const GpuMat& m);
             //! assignment operator. Performs a blocking upload to the device.
             GpuMat& operator = (const Mat& m);

             //! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
             // Contains just image size, data ptr and step.
@@ -110,7 +110,7 @@ namespace cv

             //! Downloads data from device to host memory. Blocking calls.
             operator Mat() const;
             void download(cv::Mat& m) const;

             //! returns a new GpuMatrix header for the specified row
             GpuMat row(int y) const;
@@ -161,7 +161,7 @@ namespace cv
             //! extracts a rectangular sub-GpuMatrix
             // (this is a generalized form of row, rowRange etc.)
             GpuMat operator()( Range rowRange, Range colRange ) const;
             GpuMat operator()( const Rect& roi ) const;

             //! returns true iff the GpuMatrix data is continuous
             // (i.e. when there are no gaps between successive rows).
@@ -222,33 +222,33 @@ namespace cv
        // Page locked memory is only needed for async and faster copying to the GPU.
        // It is convertible to a cv::Mat header without reference counting,
        // so you can use it with other opencv functions.

        class CV_EXPORTS MatPL
        {
        public:

            //Not supported. Now behaviour is like ALLOC_DEFAULT.
            //enum { ALLOC_DEFAULT = 0, ALLOC_PORTABLE = 1, ALLOC_WRITE_COMBINED = 4 }

            MatPL();
            MatPL(const MatPL& m);

            MatPL(int _rows, int _cols, int _type);
            MatPL(Size _size, int _type);

            //! creates from cv::Mat, copying the data
            explicit MatPL(const Mat& m);

            ~MatPL();

            MatPL& operator = (const MatPL& m);

            //! returns deep copy of the matrix, i.e. the data is copied
            MatPL clone() const;

            //! allocates new matrix data unless the matrix already has specified size and type.
            void create(int _rows, int _cols, int _type);
            void create(Size _size, int _type);

            //! decrements reference counter and releases memory if needed.
            void release();
@@ -256,25 +256,25 @@ namespace cv
            //! returns matrix header with disabled reference counting for MatPL data.
            Mat createMatHeader() const;
            operator Mat() const;

            // Please see cv::Mat for descriptions
            bool isContinuous() const;
            size_t elemSize() const;
            size_t elemSize1() const;
            int type() const;
            int depth() const;
            int channels() const;
            size_t step1() const;
            Size size() const;
            bool empty() const;

            // Please see cv::Mat for descriptions
            int flags;
            int rows, cols;
            size_t step;

            uchar* data;
            int* refcount;

            uchar* datastart;
            uchar* dataend;
@@ -288,37 +288,37 @@ namespace cv
        class CV_EXPORTS CudaStream
        {
        public:
            CudaStream();
            ~CudaStream();

            CudaStream(const CudaStream&);
            CudaStream& operator=(const CudaStream&);

            bool queryIfComplete();
            void waitForCompletion();

            //! downloads asynchronously.
            // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its subMat)
            void enqueueDownload(const GpuMat& src, MatPL& dst);
            void enqueueDownload(const GpuMat& src, Mat& dst);

            //! uploads asynchronously.
            // Warning! cv::Mat must point to page locked memory (i.e. to MatPL data or to its ROI)
            void enqueueUpload(const MatPL& src, GpuMat& dst);
            void enqueueUpload(const Mat& src, GpuMat& dst);

            void enqueueCopy(const GpuMat& src, GpuMat& dst);

            void enqueueMemSet(const GpuMat& src, Scalar val);
            void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);

            // converts matrix type, e.g. from float to uchar, depending on type
            void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);

        private:
            void create();
            void release();
            struct Impl;
            Impl *impl;
            friend struct StreamAccessor;
        };

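For reference, a minimal sketch of how the asynchronous API declared above can be driven end to end. This is not part of the diff; the sizes and variable names are illustrative only:

    // Enqueue upload, conversion and download on one stream, then block once.
    cv::gpu::CudaStream stream;
    cv::gpu::MatPL host_src(480, 640, CV_8UC1);    // page-locked host source
    cv::gpu::MatPL host_dst(480, 640, CV_32FC1);   // page-locked host destination
    cv::gpu::GpuMat dev_src, dev_dst;

    stream.enqueueUpload(host_src, dev_src);            // async host-to-device copy
    stream.enqueueConvert(dev_src, dev_dst, CV_32FC1);  // async uchar -> float conversion
    stream.enqueueDownload(dev_dst, host_dst);          // async device-to-host copy
    stream.waitForCompletion();                         // single blocking point

Because the host matrices are MatPL (page-locked) instances, all three transfers satisfy the "Warning!" requirements in the class comments above and can genuinely overlap with host work.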
@@ -348,7 +348,7 @@ namespace cv
            //! Async version
            void operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, const CudaStream& stream);

            //! Some heuristics that try to estimate
            // whether the current GPU will be faster than the CPU in this algorithm.
            // It queries the current active device.
            static bool checkIfGpuCallReasonable();
@@ -356,11 +356,11 @@ namespace cv
            int ndisp;
            int winSize;
            int preset;

            // If avergeTexThreshold == 0 => post processing is disabled
            // If avergeTexThreshold != 0 then disparity is set to 0 in each point (x, y) where, for the left image,
            // SumOfHorizontalGradientsInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold,
            // i.e. the input left image is low textured.
            float avergeTexThreshold;
        private:
            GpuMat minSSD, leBuf, riBuf;
@@ -369,4 +369,4 @@ namespace cv
 }
 #include "opencv2/gpu/matrix_operations.hpp"

 #endif /* __OPENCV_GPU_HPP__ */
@@ -61,12 +61,12 @@ namespace cv
    {
        static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }

-       extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels);
+       extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

-       extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels);
-       extern "C" void set_to_with_mask    (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels);
+       extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream = 0);
+       extern "C" void set_to_with_mask    (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);

-       extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta);
+       extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream = 0);
    }
 }
 }
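Because each new stream parameter defaults to 0, existing synchronous call sites compile unchanged, while passing a real stream opts into the asynchronous path. A hypothetical pair of call sites (the variables `mat` and `stream` are assumed to be in scope and are illustrative only):

    double val[4] = {0, 0, 0, 0};
    set_to_without_mask(mat, CV_8U, val, 1);          // stream omitted: blocking behaviour, as before
    set_to_without_mask(mat, CV_8U, val, 1, stream);  // enqueued on `stream`, returns immediately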
@@ -42,7 +42,6 @@

 #include <stddef.h>
 #include <stdio.h>
-//#include <iostream>
 #include "cuda_shared.hpp"
 #include "cuda_runtime.h"
@@ -239,19 +238,27 @@ namespace cv
    ////////////////////////////////// CopyTo /////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

-   typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels);
+   typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);

    template<typename T>
-   void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels)
+   void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
    {
        dim3 threadsPerBlock(16,16, 1);
        dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
-       ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
-           ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
-       cudaSafeCall ( cudaThreadSynchronize() );
+       if (stream == 0)
+       {
+           ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
+               ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+           cudaSafeCall ( cudaThreadSynchronize() );
+       }
+       else
+       {
+           ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
+               ((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
+       }
    }

-   extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels)
+   extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
    {
        static CopyToFunc tab[8] =
        {
@@ -269,7 +276,7 @@ namespace cv

        if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

-       func(mat_src, mat_dst, mask, channels);
+       func(mat_src, mat_dst, mask, channels, stream);
    }

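The same dispatch shape recurs in the SetTo and ConvertTo sections below: a null stream reproduces the old blocking behaviour (launch on the default stream, then cudaThreadSynchronize()), while a non-null stream launches asynchronously and leaves synchronization to the caller, e.g. via CudaStream::waitForCompletion(). Distilled into a standalone sketch — `some_kernel` is a hypothetical stand-in for the real kernels, and DevMem2D, divUp and cudaSafeCall are the file's own helpers:

    // Illustrative pattern only: synchronous on the default stream, async otherwise.
    template <typename T>
    void launch_sync_or_async(const DevMem2D& mat, const cudaStream_t& stream)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(mat.cols, block.x), divUp(mat.rows, block.y), 1);
        if (stream == 0)
        {
            some_kernel<T><<<grid, block>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step);
            cudaSafeCall(cudaThreadSynchronize());  // block until the kernel finishes
        }
        else
        {
            some_kernel<T><<<grid, block, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step);
            // no synchronization here: the stream owner decides when to wait
        }
    }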
@@ -277,28 +284,43 @@ namespace cv
    ////////////////////////////////// SetTo //////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

-   typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels);
-   typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels);
+   typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
+   typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);

    template <typename T>
-   void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels)
+   void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
    {
        dim3 threadsPerBlock(32, 8, 1);
        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-       ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
-       cudaSafeCall ( cudaThreadSynchronize() );
+       if (stream == 0)
+       {
+           ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+           cudaSafeCall ( cudaThreadSynchronize() );
+       }
+       else
+       {
+           ::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
+       }
    }

    template <typename T>
-   void set_to_without_mask_run(const DevMem2D& mat, int channels)
+   void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
    {
        dim3 threadsPerBlock(32, 8, 1);
        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
-       ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
-       cudaSafeCall ( cudaThreadSynchronize() );
+       if (stream == 0)
+       {
+           ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+           cudaSafeCall ( cudaThreadSynchronize() );
+       }
+       else
+       {
+           ::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
+       }
    }

-   extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels)
+   extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream)
    {
        double data[4];
        data[0] = scalar[0];
@@ -323,11 +345,11 @@ namespace cv

        if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

-       func(mat, channels);
+       func(mat, channels, stream);
    }

-   extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels)
+   extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
    {
        double data[4];
        data[0] = scalar[0];
@@ -352,7 +374,7 @@ namespace cv

        if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

-       func(mat, mask, channels);
+       func(mat, mask, channels, stream);
    }

@@ -360,22 +382,27 @@ namespace cv
    //////////////////////////////// ConvertTo ////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////

-   typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
+   typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);

    template<typename T, typename DT>
-   void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
+   void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
    {
        const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;

        dim3 block(32, 8);
        dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
-       ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
-       cudaSafeCall( cudaThreadSynchronize() );
+       if (stream == 0)
+       {
+           ::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+           cudaSafeCall( cudaThreadSynchronize() );
+       }
+       else
+       {
+           ::mat_operators::kernel_convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
+       }
    }

-   extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
+   extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
    {
        static CvtFunc tab[8][8] =
        {
@@ -406,7 +433,7 @@ namespace cv
        CvtFunc func = tab[sdepth][ddepth];
        if (func == 0)
            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
-       func(src, dst, width, height, alpha, beta);
+       func(src, dst, width, height, alpha, beta, stream);
    }
 } // namespace impl
 } // namespace gpu
@@ -74,6 +74,7 @@ struct CudaStream::Impl
    cudaStream_t stream;
+   int ref_counter;
 };

 namespace
 {
    template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
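The new ref_counter field lets copies of CudaStream (the copy constructor and assignment operator declared in gpu.hpp above) share a single cudaStream_t and destroy it only when the last owner goes away. The diff does not show that logic, so the following is only a sketch of the usual pattern, not the committed implementation:

    // Hypothetical reference-counting sketch.
    CudaStream::CudaStream(const CudaStream& other) : impl(other.impl)
    {
        if (impl) ++impl->ref_counter;           // one more owner of the stream
    }

    void CudaStream::release()
    {
        if (impl && --impl->ref_counter == 0)
        {
            cudaStreamDestroy(impl->stream);     // last owner destroys the CUDA stream
            delete impl;
        }
        impl = 0;
    }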
@@ -147,7 +148,7 @@ void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
 {
    // if not, allocation will be done, but after that dst will not point to page locked memory
    CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type());
    devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
 }
 void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }