implemented asynchronous call for GpuMat::setTo(), GpuMat::copyTo(), GpuMat::convertTo()
This commit is contained in:
@@ -64,7 +64,7 @@ namespace cv
|
|||||||
CV_EXPORTS int getNumberOfSMs(int device);
|
CV_EXPORTS int getNumberOfSMs(int device);
|
||||||
|
|
||||||
//////////////////////////////// GpuMat ////////////////////////////////
|
//////////////////////////////// GpuMat ////////////////////////////////
|
||||||
class CudaStrem;
|
class CudaStream;
|
||||||
|
|
||||||
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
|
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
|
||||||
class CV_EXPORTS GpuMat
|
class CV_EXPORTS GpuMat
|
||||||
|
@@ -61,12 +61,12 @@ namespace cv
|
|||||||
{
|
{
|
||||||
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
|
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
|
||||||
|
|
||||||
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels);
|
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
|
||||||
|
|
||||||
extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels);
|
extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream = 0);
|
||||||
extern "C" void set_to_with_mask (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels);
|
extern "C" void set_to_with_mask (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
|
||||||
|
|
||||||
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta);
|
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream = 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -42,7 +42,6 @@
|
|||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
//#include <iostream>
|
|
||||||
#include "cuda_shared.hpp"
|
#include "cuda_shared.hpp"
|
||||||
#include "cuda_runtime.h"
|
#include "cuda_runtime.h"
|
||||||
|
|
||||||
@@ -239,19 +238,27 @@ namespace cv
|
|||||||
////////////////////////////////// CopyTo /////////////////////////////////
|
////////////////////////////////// CopyTo /////////////////////////////////
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels);
|
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels)
|
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
|
||||||
{
|
{
|
||||||
dim3 threadsPerBlock(16,16, 1);
|
dim3 threadsPerBlock(16,16, 1);
|
||||||
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
|
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
|
||||||
::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
|
if (stream == 0)
|
||||||
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
|
{
|
||||||
cudaSafeCall ( cudaThreadSynchronize() );
|
::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
|
||||||
|
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
|
||||||
|
cudaSafeCall ( cudaThreadSynchronize() );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
|
||||||
|
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels)
|
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
|
||||||
{
|
{
|
||||||
static CopyToFunc tab[8] =
|
static CopyToFunc tab[8] =
|
||||||
{
|
{
|
||||||
@@ -269,7 +276,7 @@ namespace cv
|
|||||||
|
|
||||||
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||||
|
|
||||||
func(mat_src, mat_dst, mask, channels);
|
func(mat_src, mat_dst, mask, channels, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -277,28 +284,43 @@ namespace cv
|
|||||||
////////////////////////////////// SetTo //////////////////////////////////
|
////////////////////////////////// SetTo //////////////////////////////////
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels);
|
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
|
||||||
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels);
|
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels)
|
void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
|
||||||
{
|
{
|
||||||
dim3 threadsPerBlock(32, 8, 1);
|
dim3 threadsPerBlock(32, 8, 1);
|
||||||
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
||||||
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
|
if (stream == 0)
|
||||||
cudaSafeCall ( cudaThreadSynchronize() );
|
{
|
||||||
|
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
|
||||||
|
cudaSafeCall ( cudaThreadSynchronize() );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void set_to_without_mask_run(const DevMem2D& mat, int channels)
|
void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
|
||||||
{
|
{
|
||||||
dim3 threadsPerBlock(32, 8, 1);
|
dim3 threadsPerBlock(32, 8, 1);
|
||||||
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
|
||||||
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
|
if (stream == 0)
|
||||||
cudaSafeCall ( cudaThreadSynchronize() );
|
{
|
||||||
|
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
|
||||||
|
cudaSafeCall ( cudaThreadSynchronize() );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels)
|
extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream)
|
||||||
{
|
{
|
||||||
double data[4];
|
double data[4];
|
||||||
data[0] = scalar[0];
|
data[0] = scalar[0];
|
||||||
@@ -323,11 +345,11 @@ namespace cv
|
|||||||
|
|
||||||
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||||
|
|
||||||
func(mat, channels);
|
func(mat, channels, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels)
|
extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
|
||||||
{
|
{
|
||||||
double data[4];
|
double data[4];
|
||||||
data[0] = scalar[0];
|
data[0] = scalar[0];
|
||||||
@@ -352,7 +374,7 @@ namespace cv
|
|||||||
|
|
||||||
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||||
|
|
||||||
func(mat, mask, channels);
|
func(mat, mask, channels, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -360,22 +382,27 @@ namespace cv
|
|||||||
//////////////////////////////// ConvertTo ////////////////////////////////
|
//////////////////////////////// ConvertTo ////////////////////////////////
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta);
|
typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
|
||||||
|
|
||||||
template<typename T, typename DT>
|
template<typename T, typename DT>
|
||||||
void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta)
|
void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
|
||||||
{
|
{
|
||||||
const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
|
const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
|
||||||
|
|
||||||
dim3 block(32, 8);
|
dim3 block(32, 8);
|
||||||
dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
|
dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
|
||||||
|
if (stream == 0)
|
||||||
|
{
|
||||||
|
::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
|
||||||
|
cudaSafeCall( cudaThreadSynchronize() );
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
::mat_operators::kernel_convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
|
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
|
||||||
|
|
||||||
cudaSafeCall( cudaThreadSynchronize() );
|
|
||||||
}
|
|
||||||
|
|
||||||
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
|
|
||||||
{
|
{
|
||||||
static CvtFunc tab[8][8] =
|
static CvtFunc tab[8][8] =
|
||||||
{
|
{
|
||||||
@@ -406,7 +433,7 @@ namespace cv
|
|||||||
CvtFunc func = tab[sdepth][ddepth];
|
CvtFunc func = tab[sdepth][ddepth];
|
||||||
if (func == 0)
|
if (func == 0)
|
||||||
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||||
func(src, dst, width, height, alpha, beta);
|
func(src, dst, width, height, alpha, beta, stream);
|
||||||
}
|
}
|
||||||
} // namespace impl
|
} // namespace impl
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
|
@@ -74,6 +74,7 @@ struct CudaStream::Impl
|
|||||||
cudaStream_t stream;
|
cudaStream_t stream;
|
||||||
int ref_counter;
|
int ref_counter;
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
|
template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
|
||||||
@@ -147,7 +148,7 @@ void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
|
|||||||
{
|
{
|
||||||
// if not -> allocation will be done, but after that dst will not point to page locked memory
|
// if not -> allocation will be done, but after that dst will not point to page locked memory
|
||||||
CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )
|
CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )
|
||||||
devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
|
devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
|
||||||
}
|
}
|
||||||
void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }
|
void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user