diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 04a8385f0..dba9b30d8 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -340,6 +340,8 @@ namespace cv
     //! returns the separable filter engine with the specified filters
     CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
         const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
+    CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
+        const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
 
     //! returns horizontal 1D box filter
     //! supports only CV_8UC1 source type and CV_32FC1 sum type
@@ -367,6 +369,8 @@ namespace cv
     //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
     CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
         const Point& anchor = Point(-1,-1), int iterations = 1);
+    CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
+        const Point& anchor = Point(-1,-1), int iterations = 1);
 
     //! returns 2D filter with the specified kernel
     //! supports CV_8UC1 and CV_8UC4 types
@@ -386,7 +390,7 @@ namespace cv
     //! OpenCV version supports only CV_32F as buffer depth and
     //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
     CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-        int anchor = -1, int borderType = BORDER_CONSTANT);
+        int anchor = -1, int borderType = BORDER_DEFAULT);
 
     //! returns the primitive column filter with the specified kernel.
     //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
@@ -397,20 +401,27 @@ namespace cv
     //! OpenCV version supports only CV_32F as buffer depth and
     //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
     CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-        int anchor = -1, int borderType = BORDER_CONSTANT);
+        int anchor = -1, int borderType = BORDER_DEFAULT);
 
     //! returns the separable linear filter engine
     CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
         const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
         int columnBorderType = -1);
+    CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
+        const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
+        int columnBorderType = -1);
 
     //! returns filter engine for the generalized Sobel operator
     CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
         int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+    CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
     //! returns the Gaussian filter engine
     CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
         int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+    CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
     //! returns maximum filter
     CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
@@ -426,31 +437,42 @@ namespace cv
     static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); }
 
     //! erodes the image (applies the local minimum operator)
-    CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+    CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+    CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
     //! dilates the image (applies the local maximum operator)
-    CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+    CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+    CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
     //! applies an advanced morphological operation to the image
-    CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
+    CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
+    CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
     //! applies non-separable 2D linear filter to the image
     CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
 
     //! applies separable 2D linear filter to the image
     CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
+        Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+    CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
         Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
     //! applies generalized Sobel operator to the image
     CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+    CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
         int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
     //! applies the vertical or horizontal Scharr operator to the image
     CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+    CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
         int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
     //! smooths the image using Gaussian filter.
     CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
+        int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
+    CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
         int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
     //! applies Laplacian operator to the image
diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp
index d40f07001..55125f31b 100644
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@@ -101,17 +101,15 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
     SANITY_CHECK(dst_host);
 }
 
-PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()),
-                                                                                testing::Values(GPU_TYPICAL_MAT_SIZES),
-                                                                                testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1),
-                                                                                testing::Values(3, 5),
-                                                                                testing::Values((int)BORDER_REFLECT101, (int)BORDER_CONSTANT)))
+PERF_TEST_P(DevInfo_Size_MatType_KernelSize, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()),
+                                                                    testing::Values(GPU_TYPICAL_MAT_SIZES),
+                                                                    testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
+                                                                    testing::Values(3, 5)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     Size size = std::tr1::get<1>(GetParam());
     int type = std::tr1::get<2>(GetParam());
     int ksize = std::tr1::get<3>(GetParam());
-    int borderMode = std::tr1::get<4>(GetParam());
 
     setDevice(devInfo.deviceID());
 
@@ -123,7 +121,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, t
     GpuMat dst(size, type);
 
     Mat kernel = getGaussianKernel(ksize, 0.5, CV_32F);
-    Ptr<FilterEngine_GPU> filter = createSeparableLinearFilter_GPU(type, type, kernel, kernel, Point(-1,-1), borderMode);
+    Ptr<FilterEngine_GPU> filter = createSeparableLinearFilter_GPU(type, type, kernel, kernel, Point(-1,-1));
 
     declare.time(1.0).iterations(100);
 
diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu
index b2eaa7200..669c2c09a 100644
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@@ -12,6 +12,7 @@
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
 // Third party copyrights are property of their respective owners.
// // Redistribution and use in source and binary forms, with or without modification, @@ -51,49 +52,64 @@ using namespace cv::gpu::device; #define MAX_KERNEL_SIZE 16 #define BLOCK_DIM_X 16 -#define BLOCK_DIM_Y 16 +#define BLOCK_DIM_Y 8 +#define RESULT_STEPS 8 +#define HALO_STEPS 1 -namespace filter_krnls_column +namespace filter_column { - __constant__ float cLinearKernel[MAX_KERNEL_SIZE]; + __constant__ float c_kernel[MAX_KERNEL_SIZE]; - void loadLinearKernel(const float kernel[], int ksize) + void loadKernel(const float kernel[], int ksize) { - cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); } - template + template __global__ void linearColumnFilter(const DevMem2D_ src, PtrStep_ dst, int anchor, const B b) { - __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; + typedef typename TypeVec::cn>::vec_type sum_t; - const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; - const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; + __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1]; - T* sDataColumn = smem + threadIdx.x; + //Offset to the upper halo edge + const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; + const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y; if (x < src.cols) { - const T* srcCol = src.ptr() + x; + const T* src_col = src.ptr() + x; - sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step); - sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step); - sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step); + //Main data + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); + + //Upper halo + #pragma unroll + for(int i = 0; i < HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step); + + //Lower halo + #pragma unroll + for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step); __syncthreads(); - if (y < src.rows) + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) { - typedef typename TypeVec::cn>::vec_type sum_t; sum_t sum = VecTraits::all(0); - sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X; - #pragma unroll - for(int i = 0; i < ksize; ++i) - sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i]; + for(int j = 0; j < KERNEL_SIZE; ++j) + sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j]; - dst.ptr(y)[x] = saturate_cast(sum); + int dstY = y + i * BLOCK_DIM_Y; + + if (dstY < src.rows) + dst.ptr(dstY)[x] = saturate_cast(sum); } } } @@ -103,13 +119,13 @@ namespace cv { namespace gpu { namespace filters { template class B> void linearColumnFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) - { - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + { + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y)); B b(src.rows); - filter_krnls_column::linearColumnFilter<<>>(src, dst, anchor, b); + 
filter_column::linearColumnFilter<<>>(src, dst, anchor, b); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -219,7 +235,7 @@ namespace cv { namespace gpu { namespace filters } }; - filter_krnls_column::loadLinearKernel(kernel, ksize); + filter_column::loadKernel(kernel, ksize); callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); } diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu index 44de9cabd..77c1227dc 100644 --- a/modules/gpu/src/cuda/row_filter.cu +++ b/modules/gpu/src/cuda/row_filter.cu @@ -12,6 +12,7 @@ // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, @@ -51,64 +52,85 @@ using namespace cv::gpu::device; #define MAX_KERNEL_SIZE 16 #define BLOCK_DIM_X 16 -#define BLOCK_DIM_Y 16 +#define BLOCK_DIM_Y 4 +#define RESULT_STEPS 8 +#define HALO_STEPS 1 -namespace filter_krnls_row +namespace filter_row { - __constant__ float cLinearKernel[MAX_KERNEL_SIZE]; + __constant__ float c_kernel[MAX_KERNEL_SIZE]; - void loadLinearKernel(const float kernel[], int ksize) + void loadKernel(const float kernel[], int ksize) { - cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) ); + cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) ); } - template struct SmemType_ + namespace detail { - typedef typename TypeVec::cn>::vec_type smem_t; - }; - template struct SmemType_ - { - typedef T smem_t; - }; + template struct SmemType + { + typedef typename TypeVec::cn>::vec_type smem_t; + }; + + template struct SmemType + { + typedef T smem_t; + }; + } + template struct SmemType { - typedef typename SmemType_::smem_t smem_t; + typedef typename detail::SmemType::smem_t smem_t; }; - template + template __global__ void linearRowFilter(const DevMem2D_ src, PtrStep_ dst, int anchor, const B b) { typedef typename SmemType::smem_t smem_t; + typedef typename TypeVec::cn>::vec_type sum_t; - __shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; + __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X]; - const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; - const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; - - smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3; + //Offset to the left halo edge + const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x; + const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; if (y < src.rows) { - const T* rowSrc = src.ptr(y); + const T* src_row = src.ptr(y); - sDataRow[threadIdx.x ] = b.at_low(x - BLOCK_DIM_X, rowSrc); - sDataRow[threadIdx.x + BLOCK_DIM_X ] = b.at_high(x, rowSrc); - sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc); + //Load main data + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row); + + //Load left halo + #pragma unroll + for(int i = 0; i < HALO_STEPS; ++i) + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row); + + //Load right halo + #pragma unroll + for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i) + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * 
BLOCK_DIM_X + x, src_row); __syncthreads(); - if (x < src.cols) + D* dst_row = dst.ptr(y); + + #pragma unroll + for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i) { - typedef typename TypeVec::cn>::vec_type sum_t; sum_t sum = VecTraits::all(0); - sDataRow += threadIdx.x + BLOCK_DIM_X - anchor; - #pragma unroll - for(int i = 0; i < ksize; ++i) - sum = sum + sDataRow[i] * cLinearKernel[i]; + for (int j = 0; j < KERNEL_SIZE; ++j) + sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j]; - dst.ptr(y)[x] = saturate_cast(sum); + int dstX = x + i * BLOCK_DIM_X; + + if (dstX < src.cols) + dst_row[dstX] = saturate_cast(sum); } } } @@ -119,13 +141,14 @@ namespace cv { namespace gpu { namespace filters template class B> void linearRowFilter_caller(const DevMem2D_& src, const DevMem2D_& dst, int anchor, cudaStream_t stream) { - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); + typedef typename filter_row::SmemType::smem_t smem_t; + + const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); + const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); - typedef typename filter_krnls_row::SmemType::smem_t smem_t; B b(src.cols); - filter_krnls_row::linearRowFilter<<>>(src, dst, anchor, b); + filter_row::linearRowFilter<<>>(src, dst, anchor, b); cudaSafeCall( cudaGetLastError() ); if (stream == 0) @@ -235,7 +258,7 @@ namespace cv { namespace gpu { namespace filters } }; - filter_krnls_row::loadLinearKernel(kernel, ksize); + filter_row::loadKernel(kernel, ksize); callers[brd_type][ksize]((DevMem2D_)src, (DevMem2D_)dst, anchor, stream); } diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp index de7617fc9..0f2ba634b 100644 --- a/modules/gpu/src/filtering.cpp +++ b/modules/gpu/src/filtering.cpp @@ -50,31 +50,43 @@ using namespace cv::gpu; Ptr cv::gpu::createFilter2D_GPU(const Ptr&, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, int, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createSeparableFilter_GPU(const Ptr&, const Ptr&, int, int, int, GpuMat& buf) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr(0); } +Ptr 
cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr(0); } +Ptr cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } Ptr cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr(0); } void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_nogpu(); } -void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); } -void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); } -void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_nogpu(); } +void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } +void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); } +void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); } +void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); } +void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); } +void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); } void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, Stream&) { throw_nogpu(); } -void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int, Stream&) { throw_nogpu(); } -void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int, Stream&) { throw_nogpu(); } -void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int, Stream&) { throw_nogpu(); } -void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); } +void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); } +void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); } +void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_nogpu(); } +void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); } +void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_nogpu(); } void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, Stream&) { throw_nogpu(); } #else @@ -130,9 +142,8 @@ namespace namespace { - class Filter2DEngine_GPU : public FilterEngine_GPU + struct Filter2DEngine_GPU : public FilterEngine_GPU { - public: Filter2DEngine_GPU(const Ptr& 
filter2D_, int srcType_, int dstType_) : filter2D(filter2D_), srcType(srcType_), dstType(dstType_) {} @@ -145,10 +156,13 @@ namespace dst.create(src_size, dstType); - if (stream) - stream.enqueueMemSet(dst, Scalar::all(0.0)); - else - dst.setTo(Scalar::all(0.0)); + if (roi.size() != src_size) + { + if (stream) + stream.enqueueMemSet(dst, Scalar::all(0)); + else + dst.setTo(Scalar::all(0)); + } normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); @@ -173,16 +187,29 @@ Ptr cv::gpu::createFilter2D_GPU(const Ptr& fil namespace { - class SeparableFilterEngine_GPU : public FilterEngine_GPU + struct SeparableFilterEngine_GPU : public FilterEngine_GPU { - public: - SeparableFilterEngine_GPU(const Ptr& rowFilter_, - const Ptr& columnFilter_, int srcType_, int bufType_, int dstType_) : + SeparableFilterEngine_GPU(const Ptr& rowFilter_, const Ptr& columnFilter_, + int srcType_, int bufType_, int dstType_) : rowFilter(rowFilter_), columnFilter(columnFilter_), srcType(srcType_), bufType(bufType_), dstType(dstType_) { ksize = Size(rowFilter->ksize, columnFilter->ksize); anchor = Point(rowFilter->anchor, columnFilter->anchor); + + pbuf = &buf; + } + + SeparableFilterEngine_GPU(const Ptr& rowFilter_, const Ptr& columnFilter_, + int srcType_, int bufType_, int dstType_, + GpuMat& buf_) : + rowFilter(rowFilter_), columnFilter(columnFilter_), + srcType(srcType_), bufType(bufType_), dstType(dstType_) + { + ksize = Size(rowFilter->ksize, columnFilter->ksize); + anchor = Point(rowFilter->anchor, columnFilter->anchor); + + pbuf = &buf_; } virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) @@ -193,37 +220,36 @@ namespace dst.create(src_size, dstType); - ensureSizeIsEnough(src_size, bufType, dstBuf); + if (roi.size() != src_size) + { + if (stream) + stream.enqueueMemSet(dst, Scalar::all(0)); + else + dst.setTo(Scalar::all(0)); + } - if (stream) - { - stream.enqueueMemSet(dst, Scalar::all(0)); - stream.enqueueMemSet(dstBuf, Scalar::all(0)); - } - else - { - dst = Scalar(0.0); - dstBuf = Scalar(0.0); - } + ensureSizeIsEnough(src_size, bufType, *pbuf); normalizeROI(roi, ksize, anchor, src_size); GpuMat srcROI = src(roi); GpuMat dstROI = dst(roi); - GpuMat dstBufROI = dstBuf(roi); + GpuMat bufROI = (*pbuf)(roi); - (*rowFilter)(srcROI, dstBufROI, stream); - (*columnFilter)(dstBufROI, dstROI, stream); + (*rowFilter)(srcROI, bufROI, stream); + (*columnFilter)(bufROI, dstROI, stream); } Ptr rowFilter; Ptr columnFilter; + int srcType, bufType, dstType; Size ksize; Point anchor; - GpuMat dstBuf; + GpuMat buf; + GpuMat* pbuf; }; } @@ -233,14 +259,19 @@ Ptr cv::gpu::createSeparableFilter_GPU(const Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType)); } +Ptr cv::gpu::createSeparableFilter_GPU(const Ptr& rowFilter, + const Ptr& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf) +{ + return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf)); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// // 1D Sum Filter namespace { - class NppRowSumFilter : public BaseRowFilter_GPU + struct NppRowSumFilter : public BaseRowFilter_GPU { - public: NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {} virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) @@ -273,9 +304,8 @@ Ptr cv::gpu::getRowSumFilter_GPU(int srcType, int sumType, in namespace { - class NppColumnSumFilter : 
public BaseColumnFilter_GPU + struct NppColumnSumFilter : public BaseColumnFilter_GPU { - public: NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {} virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) @@ -314,9 +344,8 @@ namespace typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor); - class NPPBoxFilter : public BaseFilter_GPU + struct NPPBoxFilter : public BaseFilter_GPU { - public: NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) @@ -382,9 +411,8 @@ namespace { typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint); - class NPPMorphFilter : public BaseFilter_GPU + struct NPPMorphFilter : public BaseFilter_GPU { - public: NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) : BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {} @@ -436,27 +464,64 @@ Ptr cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat namespace { - class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU + struct MorphologyFilterEngine_GPU : public FilterEngine_GPU { - public: - MorphologyFilterEngine_GPU(const Ptr& filter2D_, int type, int iters_) : - Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {} + MorphologyFilterEngine_GPU(const Ptr& filter2D_, int type_, int iters_) : + filter2D(filter2D_), type(type_), iters(iters_) + { + pbuf = &buf; + } + + MorphologyFilterEngine_GPU(const Ptr& filter2D_, int type_, int iters_, GpuMat& buf_) : + filter2D(filter2D_), type(type_), iters(iters_) + { + pbuf = &buf_; + } virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) { - if (iters > 1) - morfBuf.create(src.size(), src.type()); + CV_Assert(src.type() == type); + + Size src_size = src.size(); + + dst.create(src_size, type); + + if (roi.size() != src_size) + { + if (stream) + stream.enqueueMemSet(dst, Scalar::all(0)); + else + dst.setTo(Scalar::all(0)); + } + + normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); + + if (iters > 1) + pbuf->create(src_size, type); + + GpuMat srcROI = src(roi); + GpuMat dstROI = dst(roi); + + (*filter2D)(srcROI, dstROI, stream); - Filter2DEngine_GPU::apply(src, dst, roi, stream); for(int i = 1; i < iters; ++i) { - dst.swap(morfBuf); - Filter2DEngine_GPU::apply(morfBuf, dst, roi, stream); + dst.swap((*pbuf)); + + dstROI = dst(roi); + GpuMat bufROI = (*pbuf)(roi); + + (*filter2D)(bufROI, dstROI, stream); } } + Ptr filter2D; + + int type; int iters; - GpuMat morfBuf; + + GpuMat buf; + GpuMat* pbuf; }; } @@ -471,9 +536,20 @@ Ptr cv::gpu::createMorphologyFilter_GPU(int op, int type, cons return Ptr(new MorphologyFilterEngine_GPU(filter2D, type, iterations)); } +Ptr cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, const Point& anchor, int iterations) +{ + CV_Assert(iterations > 0); + + Size ksize = kernel.size(); + + Ptr filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor); + + return Ptr(new MorphologyFilterEngine_GPU(filter2D, type, iterations, buf)); +} + namespace { - void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations, Stream& stream) + void 
morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream = Stream::Null()) { Mat kernel; Size ksize = _kernel.data ? _kernel.size() : Size(3, 3); @@ -507,54 +583,76 @@ namespace else kernel = _kernel; - Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); + Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, buf, anchor, iterations); f->apply(src, dst, Rect(0,0,-1,-1), stream); } + + void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations) + { + GpuMat buf; + morphOp(op, src, dst, _kernel, buf, anchor, iterations); + } } -void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream) +void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations) { - morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, stream); + morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations); } -void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream) +void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream) { - morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, stream); + morphOp(MORPH_ERODE, src, dst, kernel, buf, anchor, iterations, stream); } -void cv::gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations, Stream& stream) +void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations) +{ + morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations); +} + +void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream) +{ + morphOp(MORPH_DILATE, src, dst, kernel, buf, anchor, iterations, stream); +} + +void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations) +{ + GpuMat buf1; + GpuMat buf2; + morphologyEx(src, dst, op, kernel, buf1, buf2, anchor, iterations); +} + +void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor, int iterations, Stream& stream) { - GpuMat temp; switch( op ) { - case MORPH_ERODE: erode( src, dst, kernel, anchor, iterations, stream); break; - case MORPH_DILATE: dilate( src, dst, kernel, anchor, iterations, stream); break; + case MORPH_ERODE: erode(src, dst, kernel, buf1, anchor, iterations, stream); break; + case MORPH_DILATE: dilate(src, dst, kernel, buf1, anchor, iterations, stream); break; case MORPH_OPEN: - erode( src, temp, kernel, anchor, iterations, stream); - dilate( temp, dst, kernel, anchor, iterations, stream); + erode(src, buf2, kernel, buf1, anchor, iterations, stream); + dilate(buf2, dst, kernel, buf1, anchor, iterations, stream); break; case CV_MOP_CLOSE: - dilate( src, temp, kernel, anchor, iterations, stream); - erode( temp, dst, kernel, anchor, iterations, stream); + dilate(src, buf2, kernel, buf1, anchor, iterations, stream); + erode(buf2, dst, kernel, buf1, anchor, iterations, stream); break; case CV_MOP_GRADIENT: - erode( src, temp, kernel, anchor, iterations, stream); - dilate( src, dst, kernel, anchor, iterations, stream); - subtract(dst, temp, dst, stream); + erode(src, buf2, kernel, buf1, anchor, iterations, stream); + dilate(src, dst, kernel, buf1, anchor, iterations, stream); + 
subtract(dst, buf2, dst, stream); break; case CV_MOP_TOPHAT: - erode( src, dst, kernel, anchor, iterations, stream); - dilate( dst, temp, kernel, anchor, iterations, stream); - subtract(src, temp, dst, stream); + erode(src, dst, kernel, buf1, anchor, iterations, stream); + dilate(dst, buf2, kernel, buf1, anchor, iterations, stream); + subtract(src, buf2, dst, stream); break; case CV_MOP_BLACKHAT: - dilate( src, dst, kernel, anchor, iterations, stream); - erode( dst, temp, kernel, anchor, iterations, stream); - subtract(temp, src, dst, stream); + dilate(src, dst, kernel, buf1, anchor, iterations, stream); + erode(dst, buf2, kernel, buf1, anchor, iterations, stream); + subtract(buf2, src, dst, stream); break; default: - CV_Error( CV_StsBadArg, "unknown morphological operation" ); + CV_Error(CV_StsBadArg, "unknown morphological operation"); } } @@ -566,9 +664,8 @@ namespace typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor); - class NPPLinearFilter : public BaseFilter_GPU + struct NPPLinearFilter : public BaseFilter_GPU { - public: NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) : BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} @@ -654,9 +751,8 @@ namespace typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); - class NppLinearRowFilter : public BaseRowFilter_GPU + struct NppLinearRowFilter : public BaseRowFilter_GPU { - public: NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} @@ -682,9 +778,8 @@ namespace nppFilter1D_t func; }; - class GpuLinearRowFilter : public BaseRowFilter_GPU + struct GpuLinearRowFilter : public BaseRowFilter_GPU { - public: GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} @@ -769,9 +864,8 @@ Ptr cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, namespace { - class NppLinearColumnFilter : public BaseColumnFilter_GPU + struct NppLinearColumnFilter : public BaseColumnFilter_GPU { - public: NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} @@ -797,9 +891,8 @@ namespace nppFilter1D_t func; }; - class GpuLinearColumnFilter : public BaseColumnFilter_GPU + struct GpuLinearColumnFilter : public BaseColumnFilter_GPU { - public: GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} @@ -898,8 +991,24 @@ Ptr cv::gpu::createSeparableLinearFilter_GPU(int srcType, int return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType); } -void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType, - Stream& stream) +Ptr cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& 
columnKernel, GpuMat& buf, + const Point& anchor, int rowBorderType, int columnBorderType) +{ + if (columnBorderType < 0) + columnBorderType = rowBorderType; + + int cn = CV_MAT_CN(srcType); + int bdepth = CV_32F; + int bufType = CV_MAKETYPE(bdepth, cn); + + Ptr rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType); + Ptr columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType); + + return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf); +} + +void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, + Point anchor, int rowBorderType, int columnBorderType) { if( ddepth < 0 ) ddepth = src.depth(); @@ -907,6 +1016,19 @@ void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType); + f->apply(src, dst, Rect(0, 0, src.cols, src.rows)); +} + +void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, + Point anchor, int rowBorderType, int columnBorderType, + Stream& stream) +{ + if( ddepth < 0 ) + ddepth = src.depth(); + + dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); + + Ptr f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType); f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream); } @@ -920,7 +1042,20 @@ Ptr cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } -void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream) +Ptr cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType) +{ + Mat kx, ky; + getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); + return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType); +} + +void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType) +{ + GpuMat buf; + Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType); +} + +void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); @@ -935,10 +1070,16 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ky *= scale; } - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream); + sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream); } -void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType, Stream& stream) +void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType) +{ + GpuMat buf; + Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType); +} + +void 
cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream) { Mat kx, ky; getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F); @@ -953,7 +1094,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ky *= scale; } - sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream); + sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream); } void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, Stream& stream) @@ -1003,7 +1144,35 @@ Ptr cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); } -void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream) +Ptr cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType) +{ + int depth = CV_MAT_DEPTH(type); + + if (sigma2 <= 0) + sigma2 = sigma1; + + // automatic detection of kernel size from sigma + if (ksize.width <= 0 && sigma1 > 0) + ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1; + if (ksize.height <= 0 && sigma2 > 0) + ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1; + + CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 ); + + sigma1 = std::max(sigma1, 0.0); + sigma2 = std::max(sigma2, 0.0); + + Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) ); + Mat ky; + if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON ) + ky = kx; + else + ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) ); + + return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType); +} + +void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType) { if (ksize.width == 1 && ksize.height == 1) { @@ -1014,6 +1183,20 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si dst.create(src.size(), src.type()); Ptr f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType); + f->apply(src, dst, Rect(0, 0, src.cols, src.rows)); +} + +void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream) +{ + if (ksize.width == 1 && ksize.height == 1) + { + src.copyTo(dst); + return; + } + + dst.create(src.size(), src.type()); + + Ptr f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType); f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream); } @@ -1025,9 +1208,8 @@ namespace typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, NppiSize oMaskSize, NppiPoint oAnchor); - class NPPRankFilter : public BaseFilter_GPU + struct NPPRankFilter : public BaseFilter_GPU { - public: NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) diff --git 
a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index aff1348e4..512fc06ec 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -737,32 +737,6 @@ TEST(resize)
 }
 
 
-TEST(Sobel)
-{
-    Mat src, dst;
-    gpu::GpuMat d_src, d_dst;
-
-    for (int size = 2000; size <= 4000; size += 1000)
-    {
-        SUBTEST << "size " << size << ", 32F";
-
-        gen(src, size, size, CV_32F, 0, 1);
-        dst.create(size, size, CV_32F);
-
-        CPU_ON;
-        Sobel(src, dst, dst.depth(), 1, 1);
-        CPU_OFF;
-
-        d_src = src;
-        d_dst.create(size, size, CV_32F);
-
-        GPU_ON;
-        gpu::Sobel(d_src, d_dst, d_dst.depth(), 1, 1);
-        GPU_OFF;
-    }
-}
-
-
 TEST(cvtColor)
 {
     Mat src, dst;
@@ -1068,26 +1042,28 @@ TEST(solvePnPRansac)
 
 TEST(GaussianBlur)
 {
-    for (int size = 1000; size < 10000; size += 3000)
+    for (int size = 1000; size <= 4000; size += 1000)
     {
-        SUBTEST << "16SC3, size " << size;
+        SUBTEST << "8UC1, size " << size;
 
-        Mat src; gen(src, size, size, CV_16SC3, 0, 256);
+        Mat src; gen(src, size, size, CV_8UC1, 0, 256);
         Mat dst(src.size(), src.type());
 
         CPU_ON;
-        GaussianBlur(src, dst, Size(5,5), 0);
+        GaussianBlur(src, dst, Size(3, 3), 1);
         CPU_OFF;
 
         gpu::GpuMat d_src(src);
         gpu::GpuMat d_dst(src.size(), src.type());
+        gpu::GpuMat d_buf;
+        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
 
         GPU_ON;
-        gpu::GaussianBlur(d_src, d_dst, Size(5,5), 0);
+        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
         GPU_OFF;
     }
 
-    for (int size = 1000; size < 10000; size += 3000)
+    for (int size = 1000; size <= 4000; size += 1000)
     {
         SUBTEST << "8UC4, size " << size;
 
@@ -1095,14 +1071,37 @@ TEST(GaussianBlur)
         Mat dst(src.size(), src.type());
 
         CPU_ON;
-        GaussianBlur(src, dst, Size(5,5), 0);
+        GaussianBlur(src, dst, Size(3, 3), 1);
         CPU_OFF;
 
         gpu::GpuMat d_src(src);
         gpu::GpuMat d_dst(src.size(), src.type());
+        gpu::GpuMat d_buf;
+        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
 
         GPU_ON;
-        gpu::GaussianBlur(d_src, d_dst, Size(5,5), 0);
+        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        GPU_OFF;
+    }
+
+    for (int size = 1000; size <= 4000; size += 1000)
+    {
+        SUBTEST << "32FC1, size " << size;
+
+        Mat src; gen(src, size, size, CV_32FC1, 0, 1);
+        Mat dst(src.size(), src.type());
+
+        CPU_ON;
+        GaussianBlur(src, dst, Size(3, 3), 1);
+        CPU_OFF;
+
+        gpu::GpuMat d_src(src);
+        gpu::GpuMat d_dst(src.size(), src.type());
+        gpu::GpuMat d_buf;
+        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+
+        GPU_ON;
+        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
         GPU_OFF;
     }
 }
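
Usage note (not part of the patch): the buffered overloads declared in gpu.hpp above let the caller own the intermediate GpuMat and, optionally, enqueue the filter on a Stream instead of running it synchronously. A minimal sketch of how that might be used, assuming the overloads land as declared in the header diff; the helper function blurSequence and the frame vectors are hypothetical:

    #include <vector>
    #include <opencv2/gpu/gpu.hpp>

    // Hypothetical helper: blur a sequence of device images, reusing one
    // intermediate buffer and issuing all work on a single asynchronous stream.
    void blurSequence(const std::vector<cv::gpu::GpuMat>& frames,
                      std::vector<cv::gpu::GpuMat>& results)
    {
        cv::gpu::GpuMat buf;     // intermediate image, allocated on first call and reused
        cv::gpu::Stream stream;  // omit the stream argument to run synchronously

        results.resize(frames.size());

        for (size_t i = 0; i < frames.size(); ++i)
        {
            // New overload: the caller passes the buffer explicitly, so repeated
            // calls avoid reallocating the intermediate image every time.
            cv::gpu::GaussianBlur(frames[i], results[i], cv::Size(3, 3), buf,
                                  1.0, 0.0, cv::BORDER_DEFAULT, -1, stream);
        }

        stream.waitForCompletion();  // wait for the enqueued filters to finish
    }

Because every call runs on the same stream, the filters execute in order and it is safe to reuse a single buffer across iterations; the warm-up calls added to the GaussianBlur performance test above follow the same pattern.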