diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f3e9a8fad..949acba69 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -284,12 +284,11 @@ CV_EXPORTS Ptr<BaseFilter_GPU> createMorphologyFilter_GPU(int op, int type, co
 
 //! returns 2D filter with the specified kernel
 //! supports CV_8UC1 and CV_8UC4 types
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
-                                                   Point anchor = Point(-1, -1));
+CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 
 //! returns the non-separable linear filter engine
 CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-                                                        const Point& anchor = Point(-1,-1));
+                                                        Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);
 
 //! returns the primitive row filter with the specified kernel.
 //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
@@ -367,7 +366,7 @@ CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat&
                              Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
 //! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
+CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
 
 //! applies separable 2D linear filter to the image
 CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 57cec51cb..79c8fa285 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -900,49 +900,100 @@ namespace cv { namespace gpu { namespace device
 
         __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
 
-        texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __global__ void filter2D(int ofsX, int ofsY, PtrStepf dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY, const BrdReflect101 brd)
+        template <class SrcT, typename D>
+        __global__ void filter2D(const SrcT src, DevMem2D_<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
         {
+            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
+
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-            if (x > brd.last_col || y > brd.last_row)
+            if (x >= dst.cols || y >= dst.rows)
                 return;
 
-            float res = 0;
+            sum_t res = VecTraits<sum_t>::all(0);
             int kInd = 0;
 
             for (int i = 0; i < kHeight; ++i)
             {
                 for (int j = 0; j < kWidth; ++j)
-                {
-                    const int srcX = ofsX + brd.idx_col(x - anchorX + j);
-                    const int srcY = ofsY + brd.idx_row(y - anchorY + i);
-
-                    res += tex2D(filter2DTex, srcX, srcY) * c_filter2DKernel[kInd++];
-                }
+                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
             }
 
-            dst.ptr(y)[x] = res;
+            dst(y, x) = saturate_cast<D>(res);
         }
 
-        void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream)
+        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
+
+        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_filter2D_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                const int xoff; \
+                const int yoff; \
+                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
+                } \
+            }; \
+            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
+            { \
+                static void call(const DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_<D> dst, \
+                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(16, 16); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    if (stream == 0) \
+                        cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+
+        IMPLEMENT_FILTER2D_TEX_READER(uchar);
+        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+
+        IMPLEMENT_FILTER2D_TEX_READER(ushort);
+        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+
+        IMPLEMENT_FILTER2D_TEX_READER(float);
+        IMPLEMENT_FILTER2D_TEX_READER(float4);
+
+        #undef IMPLEMENT_FILTER2D_TEX_READER
+
+        template <typename T, typename D>
+        void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+                          int borderMode, const float* borderValue, cudaStream_t stream)
         {
+            typedef void (*func_t)(const DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+            static const func_t funcs[] =
+            {
+                Filter2DCaller<T, D, BrdReflect101>::call,
+                Filter2DCaller<T, D, BrdReplicate>::call,
+                Filter2DCaller<T, D, BrdConstant>::call,
+                Filter2DCaller<T, D, BrdReflect>::call,
+                Filter2DCaller<T, D, BrdWrap>::call
+            };
+
             cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
 
-            const dim3 block(16, 16);
-            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            bindTexture(&filter2DTex, src);
-
-            BrdReflect101 brd(dst.rows, dst.cols);
-
-            filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY, brd);
-            cudaSafeCall(cudaGetLastError());
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
+            funcs[borderMode](static_cast< DevMem2D_<T> >(srcWhole), ofsX, ofsY, static_cast< DevMem2D_<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
         }
+
+        template void filter2D_gpu<uchar, uchar>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<uchar4, uchar4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<ushort, ushort>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<ushort4, ushort4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<float, float>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<float4, float4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
     } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device {
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index ac61d20df..2349857ee 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -58,8 +58,8 @@ Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const
 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
+Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
@@ -78,7 +78,7 @@ void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog
 void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, Stream&) { throw_nogpu(); }
+void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_nogpu(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_nogpu(); }
 void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }
@@ -663,7 +663,10 @@ namespace cv { namespace gpu { namespace device
 {
     namespace imgproc
     {
-        void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream);
+        template <typename T, typename D>
+        void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+                          int borderMode, const float* borderValue, cudaStream_t stream);
     }
 }}}
 
@@ -705,10 +708,16 @@ namespace
         nppFilter2D_t func;
     };
 
-    struct GpuLinearFilter : public BaseFilter_GPU
+    typedef void (*gpuFilter2D_t)(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+                                  int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+                                  int borderMode, const float* borderValue, cudaStream_t stream);
+
+    struct GpuFilter2D : public BaseFilter_GPU
     {
-        GpuLinearFilter(Size ksize_, Point anchor_, const GpuMat& kernel_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_) {}
+        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
+            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
+        {
+        }
 
         virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
         {
@@ -719,30 +728,32 @@ namespace
             src.locateROI(wholeSize, ofs);
             GpuMat srcWhole(wholeSize, src.type(), src.datastart);
 
-            filter2D_gpu(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), StreamAccessor::getStream(stream));
+            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
+            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
         }
 
+        gpuFilter2D_t func;
         GpuMat kernel;
+        int brd_type;
     };
 }
 
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, Point anchor)
+Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
 {
-    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_32FC1);
+    using namespace cv::gpu::device::imgproc;
+
+    int sdepth = CV_MAT_DEPTH(srcType);
+    int scn = CV_MAT_CN(srcType);
+
+    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
+    CV_Assert(scn == 1 || scn == 4);
     CV_Assert(dstType == srcType);
+    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);
 
-    if (srcType == CV_32FC1)
-    {
-        CV_Assert(ksize.width * ksize.height <= 16 * 16);
+    Size ksize = kernel.size();
 
-        GpuMat gpu_krnl;
-        normalizeKernel(kernel, gpu_krnl, CV_32F);
-
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseFilter_GPU>(new GpuLinearFilter(ksize, anchor, gpu_krnl));
-    }
-    else
+#if 0
+    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
     {
         static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
 
@@ -754,27 +765,64 @@ Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const
 
         return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
     }
+#endif
+
+    CV_Assert(ksize.width * ksize.height <= 16 * 16);
+
+    int gpuBorderType;
+    CV_Assert( tryConvertToGpuBorderType(brd_type, gpuBorderType) );
+
+    GpuMat gpu_krnl;
+    normalizeKernel(kernel, gpu_krnl, CV_32F);
+
+    normalizeAnchor(anchor, ksize);
+
+    gpuFilter2D_t func = 0;
+
+    switch (srcType)
+    {
+    case CV_8UC1:
+        func = filter2D_gpu<uchar, uchar>;
+        break;
+    case CV_8UC4:
+        func = filter2D_gpu<uchar4, uchar4>;
+        break;
+    case CV_16UC1:
+        func = filter2D_gpu<ushort, ushort>;
+        break;
+    case CV_16UC4:
+        func = filter2D_gpu<ushort4, ushort4>;
+        break;
+    case CV_32FC1:
+        func = filter2D_gpu<float, float>;
+        break;
+    case CV_32FC4:
+        func = filter2D_gpu<float4, float4>;
+        break;
+    }
+
+    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, gpuBorderType));
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor)
+Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
 {
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor);
+    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);
 
     return createFilter2D_GPU(linearFilter, srcType, dstType);
 }
 
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, Stream& stream)
+void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
 {
-    if( ddepth < 0 )
+    if (ddepth < 0)
         ddepth = src.depth();
 
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
+    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());
 
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor);
+    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);
 
-    f->apply(src, dst, src.type() == CV_32FC1 ? Rect(0, 0, src.cols, src.rows) : Rect(0, 0, -1, -1), stream);
+    dst.create(src.size(), dst_type);
+
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpu/test/test_filters.cpp b/modules/gpu/test/test_filters.cpp
index 29c31565f..6f7b8d911 100644
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@@ -505,13 +505,14 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, MorphEx, testing::Combine(
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Filter2D
 
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, UseRoi)
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
 {
     cv::gpu::DeviceInfo devInfo;
     cv::Size size;
     int type;
     cv::Size ksize;
     cv::Point anchor;
+    int borderType;
     bool useRoi;
 
     cv::Mat img;
@@ -524,7 +525,8 @@ PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor,
         type = GET_PARAM(2);
         ksize = GET_PARAM(3);
         anchor = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
 
         cv::gpu::setDevice(devInfo.deviceID());
     }
@@ -533,26 +535,24 @@ PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor,
 TEST_P(Filter2D, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = cv::Mat::ones(ksize.height, ksize.width, CV_32FC1);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
 
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor);
+    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
 
     cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, type == CV_32FC1 ? cv::BORDER_DEFAULT : cv::BORDER_CONSTANT);
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
 
-    if (type == CV_32FC1)
-        EXPECT_MAT_NEAR(dst_gold, dst, 1e-1);
-    else
-        EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), 0.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
     testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
     testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
 } // namespace
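
For reference, a minimal usage sketch of the API after this patch (illustrative only, not part of the diff; the input path, kernel, and chosen border mode are placeholder assumptions): gpu::filter2D now takes an explicit borderType argument instead of hard-coding BORDER_REFLECT101 for CV_32FC1 and BORDER_CONSTANT for 8-bit inputs.

// filter2d_border_demo.cpp -- illustrative sketch, not part of the patch
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Load a single-channel image; after this patch the GPU filter supports
    // 1- or 4-channel 8U/16U/32F inputs.
    cv::Mat src = cv::imread("input.png", 0);   // placeholder path, grayscale load
    if (src.empty())
        return -1;

    // 5x5 normalized box kernel (placeholder kernel values).
    cv::Mat kernel = cv::Mat::ones(5, 5, CV_32FC1) / 25.0;

    cv::gpu::GpuMat d_src(src), d_dst;

    // New in this patch: the border handling mode is passed explicitly.
    cv::gpu::filter2D(d_src, d_dst, -1, kernel, cv::Point(-1, -1), cv::BORDER_REPLICATE);

    cv::Mat dst;
    d_dst.download(dst);
    return 0;
}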