implemented asynchronous call for gpumat::setTo(), gpumat::copyTo(), gpumat::converTo()

This commit is contained in:
Andrey Morozov
2010-07-26 11:22:16 +00:00
parent 1ead3a5b02
commit 769564c130
4 changed files with 114 additions and 86 deletions

View File

@@ -64,7 +64,7 @@ namespace cv
CV_EXPORTS int getNumberOfSMs(int device); CV_EXPORTS int getNumberOfSMs(int device);
//////////////////////////////// GpuMat //////////////////////////////// //////////////////////////////// GpuMat ////////////////////////////////
class CudaStrem; class CudaStream;
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat. //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
class CV_EXPORTS GpuMat class CV_EXPORTS GpuMat

View File

@@ -61,12 +61,12 @@ namespace cv
{ {
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; } static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels); extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels); extern "C" void set_to_without_mask (const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream = 0);
extern "C" void set_to_with_mask (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels); extern "C" void set_to_with_mask (const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta); extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream = 0);
} }
} }
} }

View File

@@ -42,7 +42,6 @@
#include <stddef.h> #include <stddef.h>
#include <stdio.h> #include <stdio.h>
//#include <iostream>
#include "cuda_shared.hpp" #include "cuda_shared.hpp"
#include "cuda_runtime.h" #include "cuda_runtime.h"
@@ -239,19 +238,27 @@ namespace cv
////////////////////////////////// CopyTo ///////////////////////////////// ////////////////////////////////// CopyTo /////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels); typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T> template<typename T>
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels) void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{ {
dim3 threadsPerBlock(16,16, 1); dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1); dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>> if (stream == 0)
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels); {
cudaSafeCall ( cudaThreadSynchronize() ); ::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
}
} }
extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels) extern "C" void copy_to_with_mask(const DevMem2D& mat_src, const DevMem2D& mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{ {
static CopyToFunc tab[8] = static CopyToFunc tab[8] =
{ {
@@ -269,7 +276,7 @@ namespace cv
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels); func(mat_src, mat_dst, mask, channels, stream);
} }
@@ -277,28 +284,43 @@ namespace cv
////////////////////////////////// SetTo ////////////////////////////////// ////////////////////////////////// SetTo //////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels); typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels); typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
template <typename T> template <typename T>
void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels) void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{ {
dim3 threadsPerBlock(32, 8, 1); dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step); if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() ); {
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
}
} }
template <typename T> template <typename T>
void set_to_without_mask_run(const DevMem2D& mat, int channels) void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
{ {
dim3 threadsPerBlock(32, 8, 1); dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1); dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels); if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() ); {
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
}
} }
extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels) extern "C" void set_to_without_mask(const DevMem2D& mat, int depth, const double * scalar, int channels, const cudaStream_t & stream)
{ {
double data[4]; double data[4];
data[0] = scalar[0]; data[0] = scalar[0];
@@ -323,11 +345,11 @@ namespace cv
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(mat, channels); func(mat, channels, stream);
} }
extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels) extern "C" void set_to_with_mask(const DevMem2D& mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{ {
double data[4]; double data[4];
data[0] = scalar[0]; data[0] = scalar[0];
@@ -352,7 +374,7 @@ namespace cv
if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); if (func == 0) cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(mat, mask, channels); func(mat, mask, channels, stream);
} }
@@ -360,22 +382,27 @@ namespace cv
//////////////////////////////// ConvertTo //////////////////////////////// //////////////////////////////// ConvertTo ////////////////////////////////
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta); typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);
template<typename T, typename DT> template<typename T, typename DT>
void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta) void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
{ {
const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift; const int shift = ::mat_operators::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;
dim3 block(32, 8); dim3 block(32, 8);
dim3 grid(divUp(width, block.x * shift), divUp(height, block.y)); dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
if (stream == 0)
{
::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
::mat_operators::kernel_convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
}
}
::mat_operators::kernel_convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta); extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
cudaSafeCall( cudaThreadSynchronize() );
}
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, size_t width, size_t height, double alpha, double beta)
{ {
static CvtFunc tab[8][8] = static CvtFunc tab[8][8] =
{ {
@@ -406,7 +433,7 @@ namespace cv
CvtFunc func = tab[sdepth][ddepth]; CvtFunc func = tab[sdepth][ddepth];
if (func == 0) if (func == 0)
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__); cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
func(src, dst, width, height, alpha, beta); func(src, dst, width, height, alpha, beta, stream);
} }
} // namespace impl } // namespace impl
} // namespace gpu } // namespace gpu

View File

@@ -74,6 +74,7 @@ struct CudaStream::Impl
cudaStream_t stream; cudaStream_t stream;
int ref_counter; int ref_counter;
}; };
namespace namespace
{ {
template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k) template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
@@ -147,7 +148,7 @@ void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, Mat& dst)
{ {
// if not -> allocation will be done, but after that dst will not point to page locked memory // if not -> allocation will be done, but after that dst will not point to page locked memory
CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() ) CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() )
devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost);
} }
void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); } void cv::gpu::CudaStream::enqueueDownload(const GpuMat& src, MatPL& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToHost); }