optimized gpu filters, added buffered version for different filters

This commit is contained in:
Vladislav Vinogradov 2011-09-28 11:52:26 +00:00
parent 340e23a4f0
commit 9da6d78989
6 changed files with 440 additions and 200 deletions

View File

@ -340,6 +340,8 @@ namespace cv
//! returns the separable filter engine with the specified filters //! returns the separable filter engine with the specified filters
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType); const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
//! returns horizontal 1D box filter //! returns horizontal 1D box filter
//! supports only CV_8UC1 source type and CV_32FC1 sum type //! supports only CV_8UC1 source type and CV_32FC1 sum type
@ -367,6 +369,8 @@ namespace cv
//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported. //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel,
const Point& anchor = Point(-1,-1), int iterations = 1); const Point& anchor = Point(-1,-1), int iterations = 1);
CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
const Point& anchor = Point(-1,-1), int iterations = 1);
//! returns 2D filter with the specified kernel //! returns 2D filter with the specified kernel
//! supports CV_8UC1 and CV_8UC4 types //! supports CV_8UC1 and CV_8UC4 types
@ -386,7 +390,7 @@ namespace cv
//! OpenCV version supports only CV_32F as buffer depth and //! OpenCV version supports only CV_32F as buffer depth and
//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
int anchor = -1, int borderType = BORDER_CONSTANT); int anchor = -1, int borderType = BORDER_DEFAULT);
//! returns the primitive column filter with the specified kernel. //! returns the primitive column filter with the specified kernel.
//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type. //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
@ -397,20 +401,27 @@ namespace cv
//! OpenCV version supports only CV_32F as buffer depth and //! OpenCV version supports only CV_32F as buffer depth and
//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types. //! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
int anchor = -1, int borderType = BORDER_CONSTANT); int anchor = -1, int borderType = BORDER_DEFAULT);
//! returns the separable linear filter engine //! returns the separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
int columnBorderType = -1); int columnBorderType = -1);
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
int columnBorderType = -1);
//! returns filter engine for the generalized Sobel operator //! returns filter engine for the generalized Sobel operator
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! returns the Gaussian filter engine //! returns the Gaussian filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1); int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
//! returns maximum filter //! returns maximum filter
CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1)); CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
@ -426,31 +437,42 @@ namespace cv
static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); } static inline void blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null()) { boxFilter(src, dst, -1, ksize, anchor, stream); }
//! erodes the image (applies the local minimum operator) //! erodes the image (applies the local minimum operator)
CV_EXPORTS void erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
CV_EXPORTS void erode(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
//! dilates the image (applies the local maximum operator) //! dilates the image (applies the local maximum operator)
CV_EXPORTS void dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
CV_EXPORTS void dilate(const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
//! applies an advanced morphological operation to the image //! applies an advanced morphological operation to the image
CV_EXPORTS void morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null()); CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor = Point(-1, -1), int iterations = 1);
CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
//! applies non-separable 2D linear filter to the image //! applies non-separable 2D linear filter to the image
CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null()); CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
//! applies separable 2D linear filter to the image //! applies separable 2D linear filter to the image
CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
//! applies generalized Sobel operator to the image //! applies generalized Sobel operator to the image
CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
//! applies the vertical or horizontal Scharr operator to the image //! applies the vertical or horizontal Scharr operator to the image
CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1, CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
//! smooths the image using Gaussian filter. //! smooths the image using Gaussian filter.
CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0, CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null()); int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
//! applies Laplacian operator to the image //! applies Laplacian operator to the image

View File

@ -101,17 +101,15 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
SANITY_CHECK(dst_host); SANITY_CHECK(dst_host);
} }
PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()), PERF_TEST_P(DevInfo_Size_MatType_KernelSize, separableLinearFilter, testing::Combine(testing::ValuesIn(devices()),
testing::Values(GPU_TYPICAL_MAT_SIZES), testing::Values(GPU_TYPICAL_MAT_SIZES),
testing::Values(CV_8UC1, CV_8UC4, CV_16SC3, CV_32FC1), testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
testing::Values(3, 5), testing::Values(3, 5)))
testing::Values((int)BORDER_REFLECT101, (int)BORDER_CONSTANT)))
{ {
DeviceInfo devInfo = std::tr1::get<0>(GetParam()); DeviceInfo devInfo = std::tr1::get<0>(GetParam());
Size size = std::tr1::get<1>(GetParam()); Size size = std::tr1::get<1>(GetParam());
int type = std::tr1::get<2>(GetParam()); int type = std::tr1::get<2>(GetParam());
int ksize = std::tr1::get<3>(GetParam()); int ksize = std::tr1::get<3>(GetParam());
int borderMode = std::tr1::get<4>(GetParam());
setDevice(devInfo.deviceID()); setDevice(devInfo.deviceID());
@ -123,7 +121,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize_BorderMode, separableLinearFilter, t
GpuMat dst(size, type); GpuMat dst(size, type);
Mat kernel = getGaussianKernel(ksize, 0.5, CV_32F); Mat kernel = getGaussianKernel(ksize, 0.5, CV_32F);
Ptr<FilterEngine_GPU> filter = createSeparableLinearFilter_GPU(type, type, kernel, kernel, Point(-1,-1), borderMode); Ptr<FilterEngine_GPU> filter = createSeparableLinearFilter_GPU(type, type, kernel, kernel, Point(-1,-1));
declare.time(1.0).iterations(100); declare.time(1.0).iterations(100);

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -51,49 +52,64 @@ using namespace cv::gpu::device;
#define MAX_KERNEL_SIZE 16 #define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16 #define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16 #define BLOCK_DIM_Y 8
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace filter_krnls_column namespace filter_column
{ {
__constant__ float cLinearKernel[MAX_KERNEL_SIZE]; __constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadLinearKernel(const float kernel[], int ksize) void loadKernel(const float kernel[], int ksize)
{ {
cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
} }
template <int ksize, typename T, typename D, typename B> template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b) __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{ {
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y;
T* sDataColumn = smem + threadIdx.x; //Offset to the upper halo edge
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;
if (x < src.cols) if (x < src.cols)
{ {
const T* srcCol = src.ptr() + x; const T* src_col = src.ptr() + x;
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step); //Main data
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step); #pragma unroll
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step); for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
//Upper halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);
//Lower halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y]= b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);
__syncthreads(); __syncthreads();
if (y < src.rows) #pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0); sum_t sum = VecTraits<sum_t>::all(0);
sDataColumn += (threadIdx.y + BLOCK_DIM_Y - anchor) * BLOCK_DIM_X;
#pragma unroll #pragma unroll
for(int i = 0; i < ksize; ++i) for(int j = 0; j < KERNEL_SIZE; ++j)
sum = sum + sDataColumn[i * BLOCK_DIM_X] * cLinearKernel[i]; sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];
dst.ptr(y)[x] = saturate_cast<D>(sum); int dstY = y + i * BLOCK_DIM_Y;
if (dstY < src.rows)
dst.ptr(dstY)[x] = saturate_cast<D>(sum);
} }
} }
} }
@ -103,13 +119,13 @@ namespace cv { namespace gpu { namespace filters
{ {
template <int ksize, typename T, typename D, template<typename> class B> template <int ksize, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream) void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{ {
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y)); const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
B<T> b(src.rows); B<T> b(src.rows);
filter_krnls_column::linearColumnFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b); filter_column::linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
@ -219,7 +235,7 @@ namespace cv { namespace gpu { namespace filters
} }
}; };
filter_krnls_column::loadLinearKernel(kernel, ksize); filter_column::loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream); callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
} }

View File

@ -12,6 +12,7 @@
// //
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
@ -51,64 +52,85 @@ using namespace cv::gpu::device;
#define MAX_KERNEL_SIZE 16 #define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16 #define BLOCK_DIM_X 16
#define BLOCK_DIM_Y 16 #define BLOCK_DIM_Y 4
#define RESULT_STEPS 8
#define HALO_STEPS 1
namespace filter_krnls_row namespace filter_row
{ {
__constant__ float cLinearKernel[MAX_KERNEL_SIZE]; __constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadLinearKernel(const float kernel[], int ksize) void loadKernel(const float kernel[], int ksize)
{ {
cudaSafeCall( cudaMemcpyToSymbol(cLinearKernel, kernel, ksize * sizeof(float)) ); cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
} }
template <typename T, size_t size> struct SmemType_ namespace detail
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t; template <typename T, size_t size> struct SmemType
}; {
template <typename T> struct SmemType_<T, 4> typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
{ };
typedef T smem_t;
}; template <typename T> struct SmemType<T, 4>
{
typedef T smem_t;
};
}
template <typename T> struct SmemType template <typename T> struct SmemType
{ {
typedef typename SmemType_<T, sizeof(T)>::smem_t smem_t; typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
}; };
template <int ksize, typename T, typename D, typename B> template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b) __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep_<D> dst, int anchor, const B b)
{ {
typedef typename SmemType<T>::smem_t smem_t; typedef typename SmemType<T>::smem_t smem_t;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ smem_t smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];
const int x = BLOCK_DIM_X * blockIdx.x + threadIdx.x; //Offset to the left halo edge
const int y = BLOCK_DIM_Y * blockIdx.y + threadIdx.y; const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
smem_t* sDataRow = smem + threadIdx.y * BLOCK_DIM_X * 3;
if (y < src.rows) if (y < src.rows)
{ {
const T* rowSrc = src.ptr(y); const T* src_row = src.ptr(y);
sDataRow[threadIdx.x ] = b.at_low(x - BLOCK_DIM_X, rowSrc); //Load main data
sDataRow[threadIdx.x + BLOCK_DIM_X ] = b.at_high(x, rowSrc); #pragma unroll
sDataRow[threadIdx.x + BLOCK_DIM_X * 2] = b.at_high(x + BLOCK_DIM_X, rowSrc); for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
//Load left halo
#pragma unroll
for(int i = 0; i < HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);
//Load right halo
#pragma unroll
for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);
__syncthreads(); __syncthreads();
if (x < src.cols) D* dst_row = dst.ptr(y);
#pragma unroll
for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
{ {
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
sum_t sum = VecTraits<sum_t>::all(0); sum_t sum = VecTraits<sum_t>::all(0);
sDataRow += threadIdx.x + BLOCK_DIM_X - anchor;
#pragma unroll #pragma unroll
for(int i = 0; i < ksize; ++i) for (int j = 0; j < KERNEL_SIZE; ++j)
sum = sum + sDataRow[i] * cLinearKernel[i]; sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];
dst.ptr(y)[x] = saturate_cast<D>(sum); int dstX = x + i * BLOCK_DIM_X;
if (dstX < src.cols)
dst_row[dstX] = saturate_cast<D>(sum);
} }
} }
} }
@ -119,13 +141,14 @@ namespace cv { namespace gpu { namespace filters
template <int ksize, typename T, typename D, template<typename> class B> template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream) void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{ {
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); typedef typename filter_row::SmemType<T>::smem_t smem_t;
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
typedef typename filter_krnls_row::SmemType<T>::smem_t smem_t;
B<smem_t> b(src.cols); B<smem_t> b(src.cols);
filter_krnls_row::linearRowFilter<ksize, T, D><<<grid, threads, 0, stream>>>(src, dst, anchor, b); filter_row::linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
if (stream == 0) if (stream == 0)
@ -235,7 +258,7 @@ namespace cv { namespace gpu { namespace filters
} }
}; };
filter_krnls_row::loadLinearKernel(kernel, ksize); filter_row::loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream); callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
} }

View File

@ -50,31 +50,43 @@ using namespace cv::gpu;
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat& buf) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); } Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); } Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); } Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); } Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); } Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); } Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); } Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); } Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); } Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); } Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_nogpu(); } void cv::gpu::boxFilter(const GpuMat&, GpuMat&, int, Size, Point, Stream&) { throw_nogpu(); }
void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); } void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int, Stream&) { throw_nogpu(); } void cv::gpu::erode(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_nogpu(); } void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, Stream&) { throw_nogpu(); } void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, Stream&) { throw_nogpu(); }
void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int, Stream&) { throw_nogpu(); } void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }
void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int, Stream&) { throw_nogpu(); } void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int, Stream&) { throw_nogpu(); } void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int, Stream&) { throw_nogpu(); } void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_nogpu(); }
void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, Stream&) { throw_nogpu(); } void cv::gpu::Laplacian(const GpuMat&, GpuMat&, int, int, double, Stream&) { throw_nogpu(); }
#else #else
@ -130,9 +142,8 @@ namespace
namespace namespace
{ {
class Filter2DEngine_GPU : public FilterEngine_GPU struct Filter2DEngine_GPU : public FilterEngine_GPU
{ {
public:
Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) : Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
filter2D(filter2D_), srcType(srcType_), dstType(dstType_) filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
{} {}
@ -145,10 +156,13 @@ namespace
dst.create(src_size, dstType); dst.create(src_size, dstType);
if (stream) if (roi.size() != src_size)
stream.enqueueMemSet(dst, Scalar::all(0.0)); {
else if (stream)
dst.setTo(Scalar::all(0.0)); stream.enqueueMemSet(dst, Scalar::all(0));
else
dst.setTo(Scalar::all(0));
}
normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size); normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
@ -173,16 +187,29 @@ Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>& fil
namespace namespace
{ {
class SeparableFilterEngine_GPU : public FilterEngine_GPU struct SeparableFilterEngine_GPU : public FilterEngine_GPU
{ {
public: SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, int srcType_, int bufType_, int dstType_) :
const Ptr<BaseColumnFilter_GPU>& columnFilter_, int srcType_, int bufType_, int dstType_) :
rowFilter(rowFilter_), columnFilter(columnFilter_), rowFilter(rowFilter_), columnFilter(columnFilter_),
srcType(srcType_), bufType(bufType_), dstType(dstType_) srcType(srcType_), bufType(bufType_), dstType(dstType_)
{ {
ksize = Size(rowFilter->ksize, columnFilter->ksize); ksize = Size(rowFilter->ksize, columnFilter->ksize);
anchor = Point(rowFilter->anchor, columnFilter->anchor); anchor = Point(rowFilter->anchor, columnFilter->anchor);
pbuf = &buf;
}
SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
int srcType_, int bufType_, int dstType_,
GpuMat& buf_) :
rowFilter(rowFilter_), columnFilter(columnFilter_),
srcType(srcType_), bufType(bufType_), dstType(dstType_)
{
ksize = Size(rowFilter->ksize, columnFilter->ksize);
anchor = Point(rowFilter->anchor, columnFilter->anchor);
pbuf = &buf_;
} }
virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
@ -193,37 +220,36 @@ namespace
dst.create(src_size, dstType); dst.create(src_size, dstType);
ensureSizeIsEnough(src_size, bufType, dstBuf); if (roi.size() != src_size)
{
if (stream)
stream.enqueueMemSet(dst, Scalar::all(0));
else
dst.setTo(Scalar::all(0));
}
if (stream) ensureSizeIsEnough(src_size, bufType, *pbuf);
{
stream.enqueueMemSet(dst, Scalar::all(0));
stream.enqueueMemSet(dstBuf, Scalar::all(0));
}
else
{
dst = Scalar(0.0);
dstBuf = Scalar(0.0);
}
normalizeROI(roi, ksize, anchor, src_size); normalizeROI(roi, ksize, anchor, src_size);
GpuMat srcROI = src(roi); GpuMat srcROI = src(roi);
GpuMat dstROI = dst(roi); GpuMat dstROI = dst(roi);
GpuMat dstBufROI = dstBuf(roi); GpuMat bufROI = (*pbuf)(roi);
(*rowFilter)(srcROI, dstBufROI, stream); (*rowFilter)(srcROI, bufROI, stream);
(*columnFilter)(dstBufROI, dstROI, stream); (*columnFilter)(bufROI, dstROI, stream);
} }
Ptr<BaseRowFilter_GPU> rowFilter; Ptr<BaseRowFilter_GPU> rowFilter;
Ptr<BaseColumnFilter_GPU> columnFilter; Ptr<BaseColumnFilter_GPU> columnFilter;
int srcType, bufType, dstType; int srcType, bufType, dstType;
Size ksize; Size ksize;
Point anchor; Point anchor;
GpuMat dstBuf; GpuMat buf;
GpuMat* pbuf;
}; };
} }
@ -233,14 +259,19 @@ Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter
return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType)); return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
} }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
{
return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
}
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// 1D Sum Filter // 1D Sum Filter
namespace namespace
{ {
class NppRowSumFilter : public BaseRowFilter_GPU struct NppRowSumFilter : public BaseRowFilter_GPU
{ {
public:
NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {} NppRowSumFilter(int ksize_, int anchor_) : BaseRowFilter_GPU(ksize_, anchor_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@ -273,9 +304,8 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int srcType, int sumType, in
namespace namespace
{ {
class NppColumnSumFilter : public BaseColumnFilter_GPU struct NppColumnSumFilter : public BaseColumnFilter_GPU
{ {
public:
NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {} NppColumnSumFilter(int ksize_, int anchor_) : BaseColumnFilter_GPU(ksize_, anchor_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@ -314,9 +344,8 @@ namespace
typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
NppiSize oMaskSize, NppiPoint oAnchor); NppiSize oMaskSize, NppiPoint oAnchor);
class NPPBoxFilter : public BaseFilter_GPU struct NPPBoxFilter : public BaseFilter_GPU
{ {
public:
NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} NPPBoxFilter(const Size& ksize_, const Point& anchor_, nppFilterBox_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@ -382,9 +411,8 @@ namespace
{ {
typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint); typedef NppStatus (*nppMorfFilter_t)(const Npp8u*, Npp32s, Npp8u*, Npp32s, NppiSize, const Npp8u*, NppiSize, NppiPoint);
class NPPMorphFilter : public BaseFilter_GPU struct NPPMorphFilter : public BaseFilter_GPU
{ {
public:
NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) : NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {} BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
@ -436,27 +464,64 @@ Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat
namespace namespace
{ {
class MorphologyFilterEngine_GPU : public Filter2DEngine_GPU struct MorphologyFilterEngine_GPU : public FilterEngine_GPU
{ {
public: MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) :
MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type, int iters_) : filter2D(filter2D_), type(type_), iters(iters_)
Filter2DEngine_GPU(filter2D_, type, type), iters(iters_) {} {
pbuf = &buf;
}
MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) :
filter2D(filter2D_), type(type_), iters(iters_)
{
pbuf = &buf_;
}
virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
{ {
if (iters > 1) CV_Assert(src.type() == type);
morfBuf.create(src.size(), src.type());
Size src_size = src.size();
dst.create(src_size, type);
if (roi.size() != src_size)
{
if (stream)
stream.enqueueMemSet(dst, Scalar::all(0));
else
dst.setTo(Scalar::all(0));
}
normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
if (iters > 1)
pbuf->create(src_size, type);
GpuMat srcROI = src(roi);
GpuMat dstROI = dst(roi);
(*filter2D)(srcROI, dstROI, stream);
Filter2DEngine_GPU::apply(src, dst, roi, stream);
for(int i = 1; i < iters; ++i) for(int i = 1; i < iters; ++i)
{ {
dst.swap(morfBuf); dst.swap((*pbuf));
Filter2DEngine_GPU::apply(morfBuf, dst, roi, stream);
dstROI = dst(roi);
GpuMat bufROI = (*pbuf)(roi);
(*filter2D)(bufROI, dstROI, stream);
} }
} }
Ptr<BaseFilter_GPU> filter2D;
int type;
int iters; int iters;
GpuMat morfBuf;
GpuMat buf;
GpuMat* pbuf;
}; };
} }
@ -471,9 +536,20 @@ Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, cons
return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations)); return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations));
} }
Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf, const Point& anchor, int iterations)
{
CV_Assert(iterations > 0);
Size ksize = kernel.size();
Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, type, iterations, buf));
}
namespace namespace
{ {
void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations, Stream& stream) void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream = Stream::Null())
{ {
Mat kernel; Mat kernel;
Size ksize = _kernel.data ? _kernel.size() : Size(3, 3); Size ksize = _kernel.data ? _kernel.size() : Size(3, 3);
@ -507,54 +583,76 @@ namespace
else else
kernel = _kernel; kernel = _kernel;
Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); Ptr<FilterEngine_GPU> f = createMorphologyFilter_GPU(op, src.type(), kernel, buf, anchor, iterations);
f->apply(src, dst, Rect(0,0,-1,-1), stream); f->apply(src, dst, Rect(0,0,-1,-1), stream);
} }
void morphOp(int op, const GpuMat& src, GpuMat& dst, const Mat& _kernel, Point anchor, int iterations)
{
GpuMat buf;
morphOp(op, src, dst, _kernel, buf, anchor, iterations);
}
} }
void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream) void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
{ {
morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations, stream); morphOp(MORPH_ERODE, src, dst, kernel, anchor, iterations);
} }
void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations, Stream& stream) void cv::gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
{ {
morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations, stream); morphOp(MORPH_ERODE, src, dst, kernel, buf, anchor, iterations, stream);
} }
void cv::gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations, Stream& stream) void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor, int iterations)
{
morphOp(MORPH_DILATE, src, dst, kernel, anchor, iterations);
}
void cv::gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor, int iterations, Stream& stream)
{
morphOp(MORPH_DILATE, src, dst, kernel, buf, anchor, iterations, stream);
}
void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor, int iterations)
{
GpuMat buf1;
GpuMat buf2;
morphologyEx(src, dst, op, kernel, buf1, buf2, anchor, iterations);
}
void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor, int iterations, Stream& stream)
{ {
GpuMat temp;
switch( op ) switch( op )
{ {
case MORPH_ERODE: erode( src, dst, kernel, anchor, iterations, stream); break; case MORPH_ERODE: erode(src, dst, kernel, buf1, anchor, iterations, stream); break;
case MORPH_DILATE: dilate( src, dst, kernel, anchor, iterations, stream); break; case MORPH_DILATE: dilate(src, dst, kernel, buf1, anchor, iterations, stream); break;
case MORPH_OPEN: case MORPH_OPEN:
erode( src, temp, kernel, anchor, iterations, stream); erode(src, buf2, kernel, buf1, anchor, iterations, stream);
dilate( temp, dst, kernel, anchor, iterations, stream); dilate(buf2, dst, kernel, buf1, anchor, iterations, stream);
break; break;
case CV_MOP_CLOSE: case CV_MOP_CLOSE:
dilate( src, temp, kernel, anchor, iterations, stream); dilate(src, buf2, kernel, buf1, anchor, iterations, stream);
erode( temp, dst, kernel, anchor, iterations, stream); erode(buf2, dst, kernel, buf1, anchor, iterations, stream);
break; break;
case CV_MOP_GRADIENT: case CV_MOP_GRADIENT:
erode( src, temp, kernel, anchor, iterations, stream); erode(src, buf2, kernel, buf1, anchor, iterations, stream);
dilate( src, dst, kernel, anchor, iterations, stream); dilate(src, dst, kernel, buf1, anchor, iterations, stream);
subtract(dst, temp, dst, stream); subtract(dst, buf2, dst, stream);
break; break;
case CV_MOP_TOPHAT: case CV_MOP_TOPHAT:
erode( src, dst, kernel, anchor, iterations, stream); erode(src, dst, kernel, buf1, anchor, iterations, stream);
dilate( dst, temp, kernel, anchor, iterations, stream); dilate(dst, buf2, kernel, buf1, anchor, iterations, stream);
subtract(src, temp, dst, stream); subtract(src, buf2, dst, stream);
break; break;
case CV_MOP_BLACKHAT: case CV_MOP_BLACKHAT:
dilate( src, dst, kernel, anchor, iterations, stream); dilate(src, dst, kernel, buf1, anchor, iterations, stream);
erode( dst, temp, kernel, anchor, iterations, stream); erode(dst, buf2, kernel, buf1, anchor, iterations, stream);
subtract(temp, src, dst, stream); subtract(buf2, src, dst, stream);
break; break;
default: default:
CV_Error( CV_StsBadArg, "unknown morphological operation" ); CV_Error(CV_StsBadArg, "unknown morphological operation");
} }
} }
@ -566,9 +664,8 @@ namespace
typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor); const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);
class NPPLinearFilter : public BaseFilter_GPU struct NPPLinearFilter : public BaseFilter_GPU
{ {
public:
NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) : NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
@ -654,9 +751,8 @@ namespace
typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream); typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
class NppLinearRowFilter : public BaseRowFilter_GPU struct NppLinearRowFilter : public BaseRowFilter_GPU
{ {
public:
NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
@ -682,9 +778,8 @@ namespace
nppFilter1D_t func; nppFilter1D_t func;
}; };
class GpuLinearRowFilter : public BaseRowFilter_GPU struct GpuLinearRowFilter : public BaseRowFilter_GPU
{ {
public:
GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
@ -769,9 +864,8 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
namespace namespace
{ {
class NppLinearColumnFilter : public BaseColumnFilter_GPU struct NppLinearColumnFilter : public BaseColumnFilter_GPU
{ {
public:
NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {} BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
@ -797,9 +891,8 @@ namespace
nppFilter1D_t func; nppFilter1D_t func;
}; };
class GpuLinearColumnFilter : public BaseColumnFilter_GPU struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
{ {
public:
GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {} BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
@ -898,8 +991,24 @@ Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int
return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType); return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
} }
void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, int rowBorderType, int columnBorderType, Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
Stream& stream) const Point& anchor, int rowBorderType, int columnBorderType)
{
if (columnBorderType < 0)
columnBorderType = rowBorderType;
int cn = CV_MAT_CN(srcType);
int bdepth = CV_32F;
int bufType = CV_MAKETYPE(bdepth, cn);
Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
}
void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
Point anchor, int rowBorderType, int columnBorderType)
{ {
if( ddepth < 0 ) if( ddepth < 0 )
ddepth = src.depth(); ddepth = src.depth();
@ -907,6 +1016,19 @@ void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat&
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType); Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}
void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
Point anchor, int rowBorderType, int columnBorderType,
Stream& stream)
{
if( ddepth < 0 )
ddepth = src.depth();
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType);
f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream); f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
} }
@ -920,7 +1042,20 @@ Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i
return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
} }
void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream) Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType)
{
Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
}
void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
{
GpuMat buf;
Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType);
}
void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
{ {
Mat kx, ky; Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
@ -935,10 +1070,16 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
ky *= scale; ky *= scale;
} }
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream); sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
} }
void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType, Stream& stream) void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
{
GpuMat buf;
Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType);
}
void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream)
{ {
Mat kx, ky; Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F); getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
@ -953,7 +1094,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
ky *= scale; ky *= scale;
} }
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), rowBorderType, columnBorderType, stream); sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
} }
void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, Stream& stream) void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, Stream& stream)
@ -1003,7 +1144,35 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType); return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
} }
void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream) Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
{
int depth = CV_MAT_DEPTH(type);
if (sigma2 <= 0)
sigma2 = sigma1;
// automatic detection of kernel size from sigma
if (ksize.width <= 0 && sigma1 > 0)
ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
if (ksize.height <= 0 && sigma2 > 0)
ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
sigma1 = std::max(sigma1, 0.0);
sigma2 = std::max(sigma2, 0.0);
Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
Mat ky;
if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
ky = kx;
else
ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
}
void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
{ {
if (ksize.width == 1 && ksize.height == 1) if (ksize.width == 1 && ksize.height == 1)
{ {
@ -1014,6 +1183,20 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si
dst.create(src.size(), src.type()); dst.create(src.size(), src.type());
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType); Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}
void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
{
if (ksize.width == 1 && ksize.height == 1)
{
src.copyTo(dst);
return;
}
dst.create(src.size(), src.type());
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream); f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
} }
@ -1025,9 +1208,8 @@ namespace
typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
NppiSize oMaskSize, NppiPoint oAnchor); NppiSize oMaskSize, NppiPoint oAnchor);
class NPPRankFilter : public BaseFilter_GPU struct NPPRankFilter : public BaseFilter_GPU
{ {
public:
NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {} NPPRankFilter(const Size& ksize_, const Point& anchor_, nppFilterRank_t func_) : BaseFilter_GPU(ksize_, anchor_), func(func_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null()) virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())

View File

@ -737,32 +737,6 @@ TEST(resize)
} }
TEST(Sobel)
{
Mat src, dst;
gpu::GpuMat d_src, d_dst;
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 32F";
gen(src, size, size, CV_32F, 0, 1);
dst.create(size, size, CV_32F);
CPU_ON;
Sobel(src, dst, dst.depth(), 1, 1);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_32F);
GPU_ON;
gpu::Sobel(d_src, d_dst, d_dst.depth(), 1, 1);
GPU_OFF;
}
}
TEST(cvtColor) TEST(cvtColor)
{ {
Mat src, dst; Mat src, dst;
@ -1068,26 +1042,28 @@ TEST(solvePnPRansac)
TEST(GaussianBlur) TEST(GaussianBlur)
{ {
for (int size = 1000; size < 10000; size += 3000) for (int size = 1000; size <= 4000; size += 1000)
{ {
SUBTEST << "16SC3, size " << size; SUBTEST << "8UC1, size " << size;
Mat src; gen(src, size, size, CV_16SC3, 0, 256); Mat src; gen(src, size, size, CV_8UC1, 0, 256);
Mat dst(src.size(), src.type()); Mat dst(src.size(), src.type());
CPU_ON; CPU_ON;
GaussianBlur(src, dst, Size(5,5), 0); GaussianBlur(src, dst, Size(3, 3), 1);
CPU_OFF; CPU_OFF;
gpu::GpuMat d_src(src); gpu::GpuMat d_src(src);
gpu::GpuMat d_dst(src.size(), src.type()); gpu::GpuMat d_dst(src.size(), src.type());
gpu::GpuMat d_buf;
gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
GPU_ON; GPU_ON;
gpu::GaussianBlur(d_src, d_dst, Size(5,5), 0); gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
GPU_OFF; GPU_OFF;
} }
for (int size = 1000; size < 10000; size += 3000) for (int size = 1000; size <= 4000; size += 1000)
{ {
SUBTEST << "8UC4, size " << size; SUBTEST << "8UC4, size " << size;
@ -1095,14 +1071,37 @@ TEST(GaussianBlur)
Mat dst(src.size(), src.type()); Mat dst(src.size(), src.type());
CPU_ON; CPU_ON;
GaussianBlur(src, dst, Size(5,5), 0); GaussianBlur(src, dst, Size(3, 3), 1);
CPU_OFF; CPU_OFF;
gpu::GpuMat d_src(src); gpu::GpuMat d_src(src);
gpu::GpuMat d_dst(src.size(), src.type()); gpu::GpuMat d_dst(src.size(), src.type());
gpu::GpuMat d_buf;
gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
GPU_ON; GPU_ON;
gpu::GaussianBlur(d_src, d_dst, Size(5,5), 0); gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
GPU_OFF;
}
for (int size = 1000; size <= 4000; size += 1000)
{
SUBTEST << "32FC1, size " << size;
Mat src; gen(src, size, size, CV_32FC1, 0, 1);
Mat dst(src.size(), src.type());
CPU_ON;
GaussianBlur(src, dst, Size(3, 3), 1);
CPU_OFF;
gpu::GpuMat d_src(src);
gpu::GpuMat d_dst(src.size(), src.type());
gpu::GpuMat d_buf;
gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
GPU_ON;
gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
GPU_OFF; GPU_OFF;
} }
} }