fixed gpu::filter2D

This commit is contained in:
Vladislav Vinogradov
2012-04-01 07:32:25 +00:00
parent fd31234122
commit 82d619a34d
4 changed files with 167 additions and 69 deletions

View File

@@ -900,49 +900,100 @@ namespace cv { namespace gpu { namespace device
__constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void filter2D(int ofsX, int ofsY, PtrStepf dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY, const BrdReflect101<float> brd)
template <class SrcT, typename D>
__global__ void filter2D(const SrcT src, DevMem2D_<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
{
typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x > brd.last_col || y > brd.last_row)
if (x >= dst.cols || y >= dst.rows)
return;
float res = 0;
sum_t res = VecTraits<sum_t>::all(0);
int kInd = 0;
for (int i = 0; i < kHeight; ++i)
{
for (int j = 0; j < kWidth; ++j)
{
const int srcX = ofsX + brd.idx_col(x - anchorX + j);
const int srcY = ofsY + brd.idx_row(y - anchorY + i);
res += tex2D(filter2DTex, srcX, srcY) * c_filter2DKernel[kInd++];
}
res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
}
dst.ptr(y)[x] = res;
dst(y, x) = saturate_cast<D>(res);
}
void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream)
template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
#define IMPLEMENT_FILTER2D_TEX_READER(type) \
texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_filter2D_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
const int xoff; \
const int yoff; \
tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
} \
}; \
template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
{ \
static void call(const DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_<D> dst, \
int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(16, 16); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_filter2D_ ## type , srcWhole); \
tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
cudaSafeCall( cudaGetLastError() ); \
if (stream == 0) \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
IMPLEMENT_FILTER2D_TEX_READER(uchar);
IMPLEMENT_FILTER2D_TEX_READER(uchar4);
IMPLEMENT_FILTER2D_TEX_READER(ushort);
IMPLEMENT_FILTER2D_TEX_READER(ushort4);
IMPLEMENT_FILTER2D_TEX_READER(float);
IMPLEMENT_FILTER2D_TEX_READER(float4);
#undef IMPLEMENT_FILTER2D_TEX_READER
template <typename T, typename D>
void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
int borderMode, const float* borderValue, cudaStream_t stream)
{
typedef void (*func_t)(const DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
static const func_t funcs[] =
{
Filter2DCaller<T, D, BrdReflect101>::call,
Filter2DCaller<T, D, BrdReplicate>::call,
Filter2DCaller<T, D, BrdConstant>::call,
Filter2DCaller<T, D, BrdReflect>::call,
Filter2DCaller<T, D, BrdWrap>::call
};
cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
bindTexture(&filter2DTex, src);
BrdReflect101<float> brd(dst.rows, dst.cols);
filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY, brd);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
funcs[borderMode](static_cast< DevMem2D_<T> >(srcWhole), ofsX, ofsY, static_cast< DevMem2D_<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
}
template void filter2D_gpu<uchar, uchar>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<uchar4, uchar4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<ushort, ushort>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<ushort4, ushort4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<float, float>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<float4, float4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device {