refactored Linear Filter

2013-04-29 11:51:47 +04:00
parent fcfcd4cbce
commit 1eedc6c42a
6 changed files with 416 additions and 429 deletions
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -96,6 +96,34 @@ inline void blur(InputArray src, OutputArray dst, Size ksize, Point anchor, Stre
    f->apply(src, dst, stream);
 }

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear Filter
+
+//! non-separable linear 2D filter
+CV_EXPORTS Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1,-1),
+                                            int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernel,
+                                                Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT,
+                                                Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void filter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernel, Point anchor, int borderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createLinearFilter(src.type(), ddepth, kernel, anchor, borderType);
+    f->apply(src, dst, stream);
+}
+
+
+
+
+
+
+
+//! applies Laplacian operator to the image
+//! supports only ksize = 1 and ksize = 3
+CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
+
+



@@ -194,13 +222,7 @@ CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, co
 CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, const Mat& kernel, GpuMat& buf,
    const Point& anchor = Point(-1,-1), int iterations = 1);

-//! returns 2D filter with the specified kernel
-//! supports CV_8U, CV_16U and CV_32F one and four channel image
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);

-//! returns the non-separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-    Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);

 //! returns the primitive row filter with the specified kernel.
 //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
@@ -269,9 +291,6 @@ CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat&
 CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
                             Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());

-//! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
 //! applies separable 2D linear filter to the image
 CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
@@ -297,10 +316,6 @@ CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double
 CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());

-//! applies Laplacian operator to the image
-//! supports only ksize = 1 and ksize = 3
-CV_EXPORTS void Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
-
 }} // namespace cv { namespace gpu {

 #undef __OPENCV_GPUFILTERS_DEPR_BEFORE__
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -86,6 +86,51 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,
    }
 }

+//////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
+{
+    declare.time(20.0);
+
+    const cv::Size size = GET_PARAM(0);
+    const int type = GET_PARAM(1);
+    const int ksize = GET_PARAM(2);
+
+    cv::Mat src(size, type);
+    declare.in(src, WARMUP_RNG);
+
+    cv::Mat kernel(ksize, ksize, CV_32FC1);
+    declare.in(kernel, WARMUP_RNG);
+
+    if (PERF_RUN_GPU())
+    {
+        const cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat dst;
+
+        cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(d_src.type(), -1, kernel);
+
+        TEST_CYCLE() filter2D->apply(d_src, dst);
+
+        GPU_SANITY_CHECK(dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
+
+
+
+
+
+
+
 //////////////////////////////////////////////////////////////////////
 // Sobel

@@ -330,39 +375,3 @@ PERF_TEST_P(Sz_Type_Op, MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8
        CPU_SANITY_CHECK(dst);
    }
 }
-
-//////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PERF_TEST_P(Sz_Type_KernelSz, Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
-{
-    declare.time(20.0);
-
-    const cv::Size size = GET_PARAM(0);
-    const int type = GET_PARAM(1);
-    const int ksize = GET_PARAM(2);
-
-    cv::Mat src(size, type);
-    declare.in(src, WARMUP_RNG);
-
-    cv::Mat kernel(ksize, ksize, CV_32FC1);
-    declare.in(kernel, WARMUP_RNG);
-
-    if (PERF_RUN_GPU())
-    {
-        const cv::gpu::GpuMat d_src(src);
-        cv::gpu::GpuMat dst;
-
-        TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
-
-        GPU_SANITY_CHECK(dst);
-    }
-    else
-    {
-        cv::Mat dst;
-
-        TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
-
-        CPU_SANITY_CHECK(dst);
-    }
-}
--- a/modules/gpufilters/src/cuda/filter2d.cu
+++ b/modules/gpufilters/src/cuda/filter2d.cu
@@ -48,111 +48,104 @@

 namespace cv { namespace gpu { namespace cudev
 {
-    namespace imgproc
+    template <class SrcPtr, typename D>
+    __global__ void filter2D(const SrcPtr src, PtrStepSz<D> dst,
+                             const float* __restrict__ kernel,
+                             const int kWidth, const int kHeight,
+                             const int anchorX, const int anchorY)
    {
-        #define FILTER2D_MAX_KERNEL_SIZE 16
+        typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;

-        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
+        const int x = blockIdx.x * blockDim.x + threadIdx.x;
+        const int y = blockIdx.y * blockDim.y + threadIdx.y;

-        template <class SrcT, typename D>
-        __global__ void filter2D(const SrcT src, PtrStepSz<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        if (x >= dst.cols || y >= dst.rows)
+            return;
+
+        sum_t res = VecTraits<sum_t>::all(0);
+        int kInd = 0;
+
+        for (int i = 0; i < kHeight; ++i)
        {
-            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst.cols || y >= dst.rows)
-                return;
-
-            sum_t res = VecTraits<sum_t>::all(0);
-            int kInd = 0;
-
-            for (int i = 0; i < kHeight; ++i)
-            {
-                for (int j = 0; j < kWidth; ++j)
-                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
-            }
-
-            dst(y, x) = saturate_cast<D>(res);
+            for (int j = 0; j < kWidth; ++j)
+                res = res + src(y - anchorY + i, x - anchorX + j) * kernel[kInd++];
        }

-        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
-
-        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
-            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_filter2D_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
-            { \
-                static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, \
-                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(16, 16); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
-                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
-                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
-                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    if (stream == 0) \
-                        cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        IMPLEMENT_FILTER2D_TEX_READER(uchar);
-        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(ushort);
-        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
-
-        IMPLEMENT_FILTER2D_TEX_READER(float);
-        IMPLEMENT_FILTER2D_TEX_READER(float4);
-
-        #undef IMPLEMENT_FILTER2D_TEX_READER
-
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream)
-        {
-            typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
-            static const func_t funcs[] =
-            {
-                Filter2DCaller<T, D, BrdConstant>::call,
-                Filter2DCaller<T, D, BrdReplicate>::call,
-                Filter2DCaller<T, D, BrdReflect>::call,
-                Filter2DCaller<T, D, BrdWrap>::call,
-                Filter2DCaller<T, D, BrdReflect101>::call
-            };
-
-            if (stream == 0)
-                cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-            else
-                cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-            funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
-        }
-
-        template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        dst(y, x) = saturate_cast<D>(res);
    }
+
+    template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
+
+    #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+        texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+        struct tex_filter2D_ ## type ## _reader \
+        { \
+            typedef type elem_type; \
+            typedef int index_type; \
+            const int xoff; \
+            const int yoff; \
+            tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+            __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+            { \
+                return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
+            } \
+        }; \
+        template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
+        { \
+            static void call(const PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel, \
+                int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+            { \
+                typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                dim3 block(16, 16); \
+                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kernel, kWidth, kHeight, anchorX, anchorY); \
+                cudaSafeCall( cudaGetLastError() ); \
+                if (stream == 0) \
+                    cudaSafeCall( cudaDeviceSynchronize() ); \
+            } \
+        };
+
+    IMPLEMENT_FILTER2D_TEX_READER(uchar);
+    IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(ushort);
+    IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+
+    IMPLEMENT_FILTER2D_TEX_READER(float);
+    IMPLEMENT_FILTER2D_TEX_READER(float4);
+
+    #undef IMPLEMENT_FILTER2D_TEX_READER
+
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream)
+    {
+        typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, const float* kernel,
+                               int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            Filter2DCaller<T, D, BrdConstant>::call,
+            Filter2DCaller<T, D, BrdReplicate>::call,
+            Filter2DCaller<T, D, BrdReflect>::call,
+            Filter2DCaller<T, D, BrdWrap>::call,
+            Filter2DCaller<T, D, BrdReflect101>::call
+        };
+
+        funcs[borderMode]((PtrStepSz<T>) srcWhole, ofsX, ofsY, (PtrStepSz<D>) dst, kernel,
+                          kWidth, kHeight, anchorX, anchorY, borderValue, stream);
+    }
+
+    template void filter2D<uchar  , uchar  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<uchar4 , uchar4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort , ushort >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float  , float  >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
+    template void filter2D<float4 , float4 >(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel, int kWidth, int kHeight, int anchorX, int anchorY, int borderMode, const float* borderValue, cudaStream_t stream);
 }}}

 #endif // CUDA_DISABLER
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -49,6 +49,8 @@ using namespace cv::gpu;

 Ptr<Filter> cv::gpu::createBoxFilter(int, int, Size, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }

+Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
+
 Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
@@ -57,8 +59,6 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) {
 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
@@ -76,7 +76,6 @@ void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_
 void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_no_cuda(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_no_cuda(); }
@@ -188,6 +187,138 @@ Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point
    return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }

+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear Filter
+
+namespace cv { namespace gpu { namespace cudev
+{
+    template <typename T, typename D>
+    void filter2D(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                  int kWidth, int kHeight, int anchorX, int anchorY,
+                  int borderMode, const float* borderValue, cudaStream_t stream);
+}}}
+
+namespace
+{
+    class LinearFilter : public Filter
+    {
+    public:
+        LinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef void (*filter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, const float* kernel,
+                                   int kWidth, int kHeight, int anchorX, int anchorY,
+                                   int borderMode, const float* borderValue, cudaStream_t stream);
+
+        GpuMat kernel_;
+        Point anchor_;
+        int type_;
+        filter2D_t func_;
+        int borderMode_;
+        Scalar_<float> borderVal_;
+    };
+
+    LinearFilter::LinearFilter(int srcType, int dstType, InputArray _kernel, Point anchor, int borderMode, Scalar borderVal) :
+        anchor_(anchor), type_(srcType), borderMode_(borderMode), borderVal_(borderVal)
+    {
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int scn = CV_MAT_CN(srcType);
+
+        Mat kernel = _kernel.getMat();
+
+        CV_Assert( sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F );
+        CV_Assert( scn == 1 || scn == 4 );
+        CV_Assert( dstType == srcType );
+        CV_Assert( kernel.channels() == 1 );
+        CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
+
+        Mat kernel32F;
+        kernel.convertTo(kernel32F, CV_32F);
+
+        kernel_ = gpu::createContinuous(kernel.size(), CV_32FC1);
+        kernel_.upload(kernel32F);
+
+        normalizeAnchor(anchor_, kernel.size());
+
+        switch (srcType)
+        {
+        case CV_8UC1:
+            func_ = cudev::filter2D<uchar, uchar>;
+            break;
+        case CV_8UC4:
+            func_ = cudev::filter2D<uchar4, uchar4>;
+            break;
+        case CV_16UC1:
+            func_ = cudev::filter2D<ushort, ushort>;
+            break;
+        case CV_16UC4:
+            func_ = cudev::filter2D<ushort4, ushort4>;
+            break;
+        case CV_32FC1:
+            func_ = cudev::filter2D<float, float>;
+            break;
+        case CV_32FC4:
+            func_ = cudev::filter2D<float4, float4>;
+            break;
+        }
+    }
+
+    void LinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == type_ );
+
+        _dst.create(src.size(), src.type());
+        GpuMat dst = _dst.getGpuMat();
+
+        Point ofs;
+        Size wholeSize;
+        src.locateROI(wholeSize, ofs);
+
+        GpuMat srcWhole(wholeSize, src.type(), src.datastart);
+
+        func_(srcWhole, ofs.x, ofs.y, dst, kernel_.ptr<float>(),
+              kernel_.cols, kernel_.rows, anchor_.x, anchor_.y,
+              borderMode_, borderVal_.val, StreamAccessor::getStream(_stream));
+    }
+}
+
+Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor, int borderMode, Scalar borderVal)
+{
+    if (dstType < 0)
+        dstType = srcType;
+
+    return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal);
+}
+
+
+
+
+
+
+
+
+
+
+void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
+{
+    CV_Assert(ksize == 1 || ksize == 3);
+
+    static const int K[2][9] =
+    {
+        {0, 1, 0, 1, -4, 1, 0, 1, 0},
+        {2, 0, 2, 0, -8, 0, 2, 0, 2}
+    };
+    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
+    if (scale != 1)
+        kernel *= scale;
+
+    Ptr<gpu::Filter> f = gpu::createLinearFilter(src.type(), ddepth, kernel, Point(-1,-1), borderType);
+    f->apply(src, dst, stream);
+}
+



@@ -702,172 +833,6 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
    }
 }

-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Linear Filter
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T, typename D>
-        void filter2D_gpu(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                          int borderMode, const float* borderValue, cudaStream_t stream);
-    }
-}}}
-
-namespace
-{
-    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
-        const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);
-
-    struct NPPLinearFilter : public BaseFilter_GPU
-    {
-        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-            NppiSize oKernelSize;
-            oKernelSize.height = ksize.height;
-            oKernelSize.width = ksize.width;
-            NppiPoint oAnchor;
-            oAnchor.x = anchor.x;
-            oAnchor.y = anchor.y;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter2D_t func;
-    };
-
-    typedef void (*gpuFilter2D_t)(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst,
-                                   int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
-                                   int borderMode, const float* borderValue, cudaStream_t stream);
-
-    struct GpuFilter2D : public BaseFilter_GPU
-    {
-        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
-            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
-        {
-        }
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
-        {
-            using namespace cv::gpu::cudev::imgproc;
-
-            Point ofs;
-            Size wholeSize;
-            src.locateROI(wholeSize, ofs);
-            GpuMat srcWhole(wholeSize, src.type(), src.datastart);
-
-            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
-            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
-        }
-
-        gpuFilter2D_t func;
-        GpuMat kernel;
-        int brd_type;
-    };
-}
-
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
-{
-    using namespace cv::gpu::cudev::imgproc;
-
-    int sdepth = CV_MAT_DEPTH(srcType);
-    int scn = CV_MAT_CN(srcType);
-
-    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
-    CV_Assert(scn == 1 || scn == 4);
-    CV_Assert(dstType == srcType);
-    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);
-
-    Size ksize = kernel.size();
-
-#if 0
-    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
-    {
-        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
-
-        GpuMat gpu_krnl;
-        int nDivisor;
-        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
-
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
-    }
-#endif
-
-    CV_Assert(ksize.width * ksize.height <= 16 * 16);
-
-    GpuMat gpu_krnl;
-    normalizeKernel(kernel, gpu_krnl, CV_32F);
-
-    normalizeAnchor(anchor, ksize);
-
-    gpuFilter2D_t func = 0;
-
-    switch (srcType)
-    {
-    case CV_8UC1:
-        func = filter2D_gpu<uchar, uchar>;
-        break;
-    case CV_8UC4:
-        func = filter2D_gpu<uchar4, uchar4>;
-        break;
-    case CV_16UC1:
-        func = filter2D_gpu<ushort, ushort>;
-        break;
-    case CV_16UC4:
-        func = filter2D_gpu<ushort4, ushort4>;
-        break;
-    case CV_32FC1:
-        func = filter2D_gpu<float, float>;
-        break;
-    case CV_32FC4:
-        func = filter2D_gpu<float4, float4>;
-        break;
-    }
-
-    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, brd_type));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
-{
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);
-
-    return createFilter2D_GPU(linearFilter, srcType, dstType);
-}
-
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
-{
-    if (ddepth < 0)
-        ddepth = src.depth();
-
-    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());
-
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);
-
-    dst.create(src.size(), dst_type);
-
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter

@@ -1208,22 +1173,6 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
 }

-void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale, int borderType, Stream& stream)
-{
-    CV_Assert(ksize == 1 || ksize == 3);
-
-    static const int K[2][9] =
-    {
-        {0, 1, 0, 1, -4, 1, 0, 1, 0},
-        {2, 0, 2, 0, -8, 0, 2, 0, 2}
-    };
-    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
-    if (scale != 1)
-        kernel *= scale;
-
-    filter2D(src, dst, ddepth, kernel, Point(-1,-1), borderType, stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Gaussian Filter

--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -118,6 +118,121 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Blur, testing::Combine(
    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));

+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Filter2D
+
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        anchor = GET_PARAM(4);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Filter2D, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
+
+    cv::Ptr<cv::gpu::Filter> filter2D = cv::gpu::createLinearFilter(src.type(), -1, kernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter2D->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
+    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
+
+
+
+
+
+
+
+
+
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Laplacian
+
+PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    cv::Size ksize;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        ksize = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+GPU_TEST_P(Laplacian, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
+
+    cv::Mat dst_gold;
+    cv::Laplacian(src, dst_gold, -1, ksize.width);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
+    WHOLE_SUBMAT));
+
+
+
+
+
+
+
+
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Sobel

@@ -332,49 +447,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                    BorderType(cv::BORDER_REFLECT)),
    WHOLE_SUBMAT));

-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Laplacian
-
-PARAM_TEST_CASE(Laplacian, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    bool useRoi;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        useRoi = GET_PARAM(4);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Laplacian, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Laplacian(loadMat(src, useRoi), dst, -1, ksize.width);
-
-    cv::Mat dst_gold;
-    cv::Laplacian(src, dst_gold, -1, ksize.width);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 1e-3);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
-    testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
-    WHOLE_SUBMAT));
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode

@@ -527,56 +599,4 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, MorphEx, testing::Combine(
    testing::Values(Iterations(1), Iterations(2), Iterations(3)),
    WHOLE_SUBMAT));

-/////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
-{
-    cv::gpu::DeviceInfo devInfo;
-    cv::Size size;
-    int type;
-    cv::Size ksize;
-    cv::Point anchor;
-    int borderType;
-    bool useRoi;
-
-    cv::Mat img;
-
-    virtual void SetUp()
-    {
-        devInfo = GET_PARAM(0);
-        size = GET_PARAM(1);
-        type = GET_PARAM(2);
-        ksize = GET_PARAM(3);
-        anchor = GET_PARAM(4);
-        borderType = GET_PARAM(5);
-        useRoi = GET_PARAM(6);
-
-        cv::gpu::setDevice(devInfo.deviceID());
-    }
-};
-
-GPU_TEST_P(Filter2D, Accuracy)
-{
-    cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
-
-    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
-
-    cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
-
-    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
-}
-
-INSTANTIATE_TEST_CASE_P(GPU_Filters, Filter2D, testing::Combine(
-    ALL_DEVICES,
-    DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
-    testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
-    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
-    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
-    WHOLE_SUBMAT));
-
 #endif // HAVE_CUDA
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -961,10 +961,11 @@ TEST(filter2D)
            gpu::GpuMat d_src(src);
            gpu::GpuMat d_dst;

-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            Ptr<gpu::Filter> filter2D = gpu::createLinearFilter(d_src.type(), -1, kernel);
+            filter2D->apply(d_src, d_dst);

            GPU_ON;
-            gpu::filter2D(d_src, d_dst, -1, kernel);
+            filter2D->apply(d_src, d_dst);
            GPU_OFF;
        }
    }