used new device layer for cv::gpu::transpose

This commit is contained in:
Vladislav Vinogradov 2013-08-26 10:43:08 +04:00
parent 6dbb32a05d
commit 7b3bbcea71
4 changed files with 84 additions and 129 deletions

View File

@ -63,52 +63,6 @@ void cv::cuda::copyMakeBorder(InputArray, OutputArray, int, int, int, int, int,
#else /* !defined (HAVE_CUDA) */ #else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// transpose
namespace arithm
{
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream);
}
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8 );
_dst.create( src.cols, src.rows, src.type() );
GpuMat dst = _dst.getGpuMat();
cudaStream_t stream = StreamAccessor::getStream(_stream);
if (src.elemSize() == 1)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else if (src.elemSize() == 4)
{
arithm::transpose<int>(src, dst, stream);
}
else // if (src.elemSize() == 8)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
arithm::transpose<double>(src, dst, stream);
}
}
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// flip // flip

View File

@ -40,83 +40,53 @@
// //
//M*/ //M*/
#if !defined CUDA_DISABLER #include "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp" #ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda; #error "opencv_cudev is required"
using namespace cv::cuda::device;
namespace arithm #else
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
using namespace cv::cudev;
void cv::cuda::transpose(InputArray _src, OutputArray _dst, Stream& stream)
{ {
const int TRANSPOSE_TILE_DIM = 16; GpuMat src = _src.getGpuMat();
const int TRANSPOSE_BLOCK_ROWS = 16;
template <typename T> const size_t elemSize = src.elemSize();
__global__ void transposeKernel(const PtrStepSz<T> src, PtrStep<T> dst)
CV_Assert( elemSize == 1 || elemSize == 4 || elemSize == 8 );
_dst.create( src.cols, src.rows, src.type() );
GpuMat dst = _dst.getGpuMat();
if (elemSize == 1)
{ {
__shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1]; NppStreamHandler h(StreamAccessor::getStream(stream));
int blockIdx_x, blockIdx_y; NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
// do diagonal reordering nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
if (gridDim.x == gridDim.y) dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
if (!stream)
CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
}
else if (elemSize == 4)
{ {
blockIdx_y = blockIdx.x; gridTranspose(globPtr<int>(src), globPtr<int>(dst), stream);
blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
} }
else else // if (elemSize == 8)
{ {
int bid = blockIdx.x + gridDim.x * blockIdx.y; gridTranspose(globPtr<double>(src), globPtr<double>(dst), stream);
blockIdx_y = bid % gridDim.y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
} }
int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.cols)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.rows)
{
tile[threadIdx.y + i][threadIdx.x] = src(yIndex + i, xIndex);
}
}
}
__syncthreads();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
if (xIndex < src.rows)
{
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < src.cols)
{
dst(yIndex + i, xIndex) = tile[threadIdx.x][threadIdx.y + i];
}
}
}
}
template <typename T> void transpose(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
transposeKernel<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void transpose<int>(PtrStepSz<int> src, PtrStepSz<int> dst, cudaStream_t stream);
template void transpose<double>(PtrStepSz<double> src, PtrStepSz<double> dst, cudaStream_t stream);
} }
#endif // CUDA_DISABLER #endif

View File

@ -55,15 +55,12 @@ namespace cv { namespace cudev {
namespace transpose_detail namespace transpose_detail
{ {
const int TRANSPOSE_TILE_DIM = 16; template <int TILE_DIM, int BLOCK_DIM_Y, class SrcPtr, typename DstType>
const int TRANSPOSE_BLOCK_ROWS = 16;
template <class SrcPtr, typename DstType>
__global__ void transpose(const SrcPtr src, GlobPtr<DstType> dst, const int rows, const int cols) __global__ void transpose(const SrcPtr src, GlobPtr<DstType> dst, const int rows, const int cols)
{ {
typedef typename PtrTraits<SrcPtr>::value_type src_type; typedef typename PtrTraits<SrcPtr>::value_type src_type;
__shared__ src_type tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM + 1]; __shared__ src_type tile[TILE_DIM][TILE_DIM + 1];
int blockIdx_x, blockIdx_y; int blockIdx_x, blockIdx_y;
@ -80,12 +77,12 @@ namespace transpose_detail
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x; blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
} }
int xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x; int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y; int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
if (xIndex < cols) if (xIndex < cols)
{ {
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS) for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
{ {
if (yIndex + i < rows) if (yIndex + i < rows)
{ {
@ -96,12 +93,12 @@ namespace transpose_detail
__syncthreads(); __syncthreads();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x; xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y; yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
if (xIndex < rows) if (xIndex < rows)
{ {
for (int i = 0; i < TRANSPOSE_TILE_DIM; i += TRANSPOSE_BLOCK_ROWS) for (int i = 0; i < TILE_DIM; i += BLOCK_DIM_Y)
{ {
if (yIndex + i < cols) if (yIndex + i < cols)
{ {
@ -111,13 +108,13 @@ namespace transpose_detail
} }
} }
template <class SrcPtr, typename DstType> template <class Policy, class SrcPtr, typename DstType>
__host__ void transpose(const SrcPtr& src, const GlobPtr<DstType>& dst, int rows, int cols, cudaStream_t stream) __host__ void transpose(const SrcPtr& src, const GlobPtr<DstType>& dst, int rows, int cols, cudaStream_t stream)
{ {
const dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM); const dim3 block(Policy::tile_dim, Policy::block_dim_y);
const dim3 grid(divUp(cols, block.x), divUp(rows, block.y)); const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
transpose<<<grid, block, 0, stream>>>(src, dst, rows, cols); transpose<Policy::tile_dim, Policy::block_dim_y><<<grid, block, 0, stream>>>(src, dst, rows, cols);
CV_CUDEV_SAFE_CALL( cudaGetLastError() ); CV_CUDEV_SAFE_CALL( cudaGetLastError() );
if (stream == 0) if (stream == 0)

View File

@ -49,19 +49,53 @@
#include "../common.hpp" #include "../common.hpp"
#include "../ptr2d/traits.hpp" #include "../ptr2d/traits.hpp"
#include "../ptr2d/gpumat.hpp" #include "../ptr2d/gpumat.hpp"
#include "../ptr2d/glob.hpp"
#include "detail/transpose.hpp" #include "detail/transpose.hpp"
namespace cv { namespace cudev { namespace cv { namespace cudev {
template <class SrcPtr, typename DstType> template <class Policy, class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null()) __host__ void gridTranspose_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{ {
const int rows = getRows(src); const int rows = getRows(src);
const int cols = getCols(src); const int cols = getCols(src);
dst.create(cols, rows); dst.create(cols, rows);
transpose_detail::transpose(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream)); transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
}
template <class Policy, class SrcPtr, typename DstType>
__host__ void gridTranspose_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
const int rows = getRows(src);
const int cols = getCols(src);
CV_Assert( getRows(dst) == cols && getCols(dst) == rows );
transpose_detail::transpose<Policy>(shrinkPtr(src), shrinkPtr(dst), rows, cols, StreamAccessor::getStream(stream));
}
// Default Policy
struct DefaultTransposePolicy
{
enum {
tile_dim = 16,
block_dim_y = 16
};
};
template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
{
gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
}
template <class SrcPtr, typename DstType>
__host__ void gridTranspose(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
{
gridTranspose_<DefaultTransposePolicy>(src, dst, stream);
} }
}} }}