added custom implementation for nearest and linear interpolation

This commit is contained in:
Vladislav Vinogradov 2013-08-22 12:31:10 +04:00
parent 3b05acf936
commit 241cc417f9

View File

@ -54,17 +54,68 @@ namespace cv { namespace gpu { namespace device
{ {
namespace imgproc namespace imgproc
{ {
template <typename T> __global__ void resize_nearest(const PtrStep<T> src, const float fx, const float fy, PtrStepSz<T> dst)
{
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
if (dst_x < dst.cols && dst_y < dst.rows)
{
const float src_x = dst_x * fx;
const float src_y = dst_y * fy;
dst(dst_y, dst_x) = src(__float2int_rz(src_y), __float2int_rz(src_x));
}
}
template <typename T> __global__ void resize_linear(const PtrStepSz<T> src, const float fx, const float fy, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
if (dst_x < dst.cols && dst_y < dst.rows)
{
const float src_x = dst_x * fx;
const float src_y = dst_y * fy;
work_type out = VecTraits<work_type>::all(0);
const int x1 = __float2int_rd(src_x);
const int y1 = __float2int_rd(src_y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
const int x2_read = ::min(x2, src.cols - 1);
const int y2_read = ::min(y2, src.rows - 1);
T src_reg = src(y1, x1);
out = out + src_reg * ((x2 - src_x) * (y2 - src_y));
src_reg = src(y1, x2_read);
out = out + src_reg * ((src_x - x1) * (y2 - src_y));
src_reg = src(y2_read, x1);
out = out + src_reg * ((x2 - src_x) * (src_y - y1));
src_reg = src(y2_read, x2_read);
out = out + src_reg * ((src_x - x1) * (src_y - y1));
dst(dst_y, dst_x) = saturate_cast<T>(out);
}
}
template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, const float fx, const float fy, PtrStepSz<T> dst) template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, const float fx, const float fy, PtrStepSz<T> dst)
{ {
const int x = blockDim.x * blockIdx.x + threadIdx.x; const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y; const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows) if (dst_x < dst.cols && dst_y < dst.rows)
{ {
const float xcoo = x * fx; const float src_x = dst_x * fx;
const float ycoo = y * fy; const float src_y = dst_y * fy;
dst(y, x) = saturate_cast<T>(src(ycoo, xcoo)); dst(dst_y, dst_x) = src(src_y, src_x);
} }
} }
@ -77,12 +128,34 @@ namespace cv { namespace gpu { namespace device
BrdReplicate<T> brd(src.rows, src.cols); BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd); BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy); Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst); resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
} }
}; };
template <typename T> struct ResizeDispatcherStream<PointFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
resize_nearest<<<grid, block, 0, stream>>>(src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<LinearFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
resize_linear<<<grid, block, 0, stream>>>(src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{ {
@ -101,6 +174,32 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() ); cudaSafeCall( cudaDeviceSynchronize() );
} }
}; };
template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
{
static void call(PtrStepSz<T> src, PtrStepSz<T>, int, int, float fx, float fy, PtrStepSz<T> dst)
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
resize_nearest<<<grid, block>>>(src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherNonStream<LinearFilter, T>
{
static void call(PtrStepSz<T> src, PtrStepSz<T>, int, int, float fx, float fy, PtrStepSz<T> dst)
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
resize_linear<<<grid, block>>>(src, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \ #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \ texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
@ -140,6 +239,58 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaGetLastError() ); \ cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \ cudaSafeCall( cudaDeviceSynchronize() ); \
} \ } \
}; \
template <> struct ResizeDispatcherNonStream<PointFilter, type > \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
{ \
const dim3 block(32, 8); \
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type, srcWhole); \
tex_resize_ ## type ## _reader texSrc; \
texSrc.xoff = xoff; \
texSrc.yoff = yoff; \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
PointFilter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
else \
{ \
BrdReplicate< type > brd(src.rows, src.cols); \
BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
PointFilter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <> struct ResizeDispatcherNonStream<LinearFilter, type > \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
{ \
const dim3 block(32, 8); \
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type, srcWhole); \
tex_resize_ ## type ## _reader texSrc; \
texSrc.xoff = xoff; \
texSrc.yoff = yoff; \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
LinearFilter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
else \
{ \
BrdReplicate< type > brd(src.rows, src.cols); \
BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
LinearFilter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; };
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar) OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
@ -180,7 +331,7 @@ namespace cv { namespace gpu { namespace device
if (x < dst.cols && y < dst.rows) if (x < dst.cols && y < dst.rows)
{ {
dst(y, x) = saturate_cast<T>(src(y, x)); dst(y, x) = src(y, x);
} }
} }
@ -227,7 +378,8 @@ namespace cv { namespace gpu { namespace device
ResizeDispatcher<CubicFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call,
ResizeAreaDispatcher<T>::call ResizeAreaDispatcher<T>::call
}; };
// chenge to linear if area interpolation upscaling
// change to linear if area interpolation upscaling
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f)) if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1; interpolation = 1;