diff --git a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
index 0c821745f..cc73da9d9 100644
--- a/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
+++ b/modules/gpufeatures2d/include/opencv2/gpufeatures2d.hpp
@@ -351,7 +351,7 @@ private:
 
     FAST_GPU fastDetector_;
 
-    Ptr<FilterEngine_GPU> blurFilter;
+    Ptr<gpu::Filter> blurFilter;
 
     GpuMat d_keypoints_;
 };
diff --git a/modules/gpufeatures2d/src/orb.cpp b/modules/gpufeatures2d/src/orb.cpp
index 495ca3f6e..7cb1cbecc 100644
--- a/modules/gpufeatures2d/src/orb.cpp
+++ b/modules/gpufeatures2d/src/orb.cpp
@@ -468,7 +468,7 @@ cv::gpu::ORB_GPU::ORB_GPU(int nFeatures, float scaleFactor, int nLevels, int edg
 
     pattern_.upload(h_pattern);
 
-    blurFilter = createGaussianFilter_GPU(CV_8UC1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
+    blurFilter = gpu::createGaussianFilter(CV_8UC1, -1, Size(7, 7), 2, 2, BORDER_REFLECT_101);
 
     blurForDescriptor = false;
 }
@@ -632,7 +632,7 @@ void cv::gpu::ORB_GPU::computeDescriptors(GpuMat& descriptors)
         {
             // preprocess the resized image
             ensureSizeIsEnough(imagePyr_[level].size(), imagePyr_[level].type(), buf_);
-            blurFilter->apply(imagePyr_[level], buf_, Rect(0, 0, imagePyr_[level].cols, imagePyr_[level].rows));
+            blurFilter->apply(imagePyr_[level], buf_);
         }
 
         computeOrbDescriptor_gpu(blurForDescriptor ? buf_ : imagePyr_[level], keyPointsPyr_[level].ptr<short2>(0), keyPointsPyr_[level].ptr<float>(2),
diff --git a/modules/gpufilters/include/opencv2/gpufilters.hpp b/modules/gpufilters/include/opencv2/gpufilters.hpp
index 1133ae01d..bf7a8e928 100644
--- a/modules/gpufilters/include/opencv2/gpufilters.hpp
+++ b/modules/gpufilters/include/opencv2/gpufilters.hpp
@@ -131,6 +131,83 @@ inline void Laplacian(InputArray src, OutputArray dst, int ddepth, int ksize, do
     f->apply(src, dst, stream);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Separable Linear Filter
+
+//! separable linear 2D filter
+CV_EXPORTS Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
+                                                   Point anchor = Point(-1,-1), int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernelX, InputArray kernelY,
+                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                                   Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void sepFilter2D(InputArray src, OutputArray dst, int ddepth, InputArray kernelX, InputArray kernelY, Point anchor, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createSeparableLinearFilter(src.type(), ddepth, kernelX, kernelY, anchor, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deriv Filter
+
+//! the generalized Deriv operator
+CV_EXPORTS Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
+                                         int ksize, bool normalize = false, double scale = 1,
+                                         int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! the Sobel operator
+CV_EXPORTS Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
+                                         double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+//! the vertical or horizontal Scharr operator
+CV_EXPORTS Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
+                                          double scale = 1, int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Sobel(InputArray src, OutputArray dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
+                                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                             Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void Sobel(InputArray src, OutputArray dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createSobelFilter(src.type(), ddepth, dx, dy, ksize, scale, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void Scharr(InputArray src, OutputArray dst, int ddepth, int dx, int dy, double scale = 1,
+                                              int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                              Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void Scharr(InputArray src, OutputArray dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createScharrFilter(src.type(), ddepth, dx, dy, scale, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gaussian Filter
+
+//! smooths the image using Gaussian filter
+CV_EXPORTS Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
+                                            double sigma1, double sigma2 = 0,
+                                            int rowBorderMode = BORDER_DEFAULT, int columnBorderMode = -1);
+
+__OPENCV_GPUFILTERS_DEPR_BEFORE__ void GaussianBlur(InputArray src, OutputArray dst, Size ksize,
+                                                    double sigma1, double sigma2 = 0,
+                                                    int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
+                                                    Stream& stream = Stream::Null()) __OPENCV_GPUFILTERS_DEPR_AFTER__;
+
+inline void GaussianBlur(InputArray src, OutputArray dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
+{
+    Ptr<gpu::Filter> f = gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, rowBorderType, columnBorderType);
+    f->apply(src, dst, stream);
+}
+
+
+
+
+
+
 
 
 
@@ -196,14 +273,7 @@ public:
     virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
 };
 
-//! returns the non-separable filter engine with the specified filter
-CV_EXPORTS Ptr<FilterEngine_GPU> createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType);
 
-//! returns the separable filter engine with the specified filters
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf);
 
 //! returns horizontal 1D box filter
 //! supports only CV_8UC1 source type and CV_32FC1 sum type
@@ -230,47 +300,7 @@ CV_EXPORTS Ptr<FilterEngine_GPU> createMorphologyFilter_GPU(int op, int type, co
 
 
 
-//! returns the primitive row filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when srcType == CV_8UC1 or srcType == CV_8UC4 and bufType == srcType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
 
-//! returns the primitive column filter with the specified kernel.
-//! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 dst type.
-//! there are two version of algorithm: NPP and OpenCV.
-//! NPP calls when dstType == CV_8UC1 or dstType == CV_8UC4 and bufType == dstType,
-//! otherwise calls OpenCV version.
-//! NPP supports only BORDER_CONSTANT border type.
-//! OpenCV version supports only CV_32F as buffer depth and
-//! BORDER_REFLECT101, BORDER_REPLICATE and BORDER_CONSTANT border types.
-CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
-    int anchor = -1, int borderType = BORDER_DEFAULT);
-
-//! returns the separable linear filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-    const Mat& columnKernel, GpuMat& buf, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT,
-    int columnBorderType = -1);
-
-//! returns filter engine for the generalized Sobel operator
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf,
-                                                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-
-//! returns the Gaussian filter engine
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                                                          int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
 
 //! returns maximum filter
 CV_EXPORTS Ptr<BaseFilter_GPU> getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1));
@@ -297,30 +327,8 @@ CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat&
 CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2,
                              Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
-//! applies separable 2D linear filter to the image
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                            Point anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1,
-                            Stream& stream = Stream::Null());
 
-//! applies generalized Sobel operator to the image
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize = 3, double scale = 1,
-                      int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
-//! applies the vertical or horizontal Scharr operator to the image
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale = 1,
-                       int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
-
-//! smooths the image using Gaussian filter.
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1);
-CV_EXPORTS void GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2 = 0,
-                             int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1, Stream& stream = Stream::Null());
 
 }} // namespace cv { namespace gpu {
 
diff --git a/modules/gpufilters/perf/perf_filters.cpp b/modules/gpufilters/perf/perf_filters.cpp
index efa99696a..9c7dcb8b4 100644
--- a/modules/gpufilters/perf/perf_filters.cpp
+++ b/modules/gpufilters/perf/perf_filters.cpp
@@ -159,13 +159,6 @@ PERF_TEST_P(Sz_Type_KernelSz, Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(C
     }
 }
 
-
-
-
-
-
-
-
 //////////////////////////////////////////////////////////////////////
 // Sobel
 
@@ -184,9 +177,10 @@ PERF_TEST_P(Sz_Type_KernelSz, Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
+        cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(d_src.type(), -1, 1, 1, ksize);
+
+        TEST_CYCLE() sobel->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -217,9 +211,10 @@ PERF_TEST_P(Sz_Type, Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
+        cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(d_src.type(), -1, 1, 0);
+
+        TEST_CYCLE() scharr->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -251,9 +246,10 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     {
         const cv::gpu::GpuMat d_src(src);
         cv::gpu::GpuMat dst;
-        cv::gpu::GpuMat d_buf;
 
-        TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(ksize, ksize), 0.5);
+
+        TEST_CYCLE() gauss->apply(d_src, dst);
 
         GPU_SANITY_CHECK(dst);
     }
@@ -267,6 +263,13 @@ PERF_TEST_P(Sz_Type_KernelSz, GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Value
     }
 }
 
+
+
+
+
+
+
+
 //////////////////////////////////////////////////////////////////////
 // Erode
 
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 305ecec23..c4de57440 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -53,22 +53,24 @@ Ptr<Filter> cv::gpu::createLinearFilter(int, int, InputArray, Point, int, Scalar
 
 Ptr<Filter> cv::gpu::createLaplacianFilter(int, int, int, double, int, Scalar) { throw_no_cuda(); return Ptr<Filter>(); }
 
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int, int, InputArray, InputArray, Point, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createDerivFilter(int, int, int, int, int, bool, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createSobelFilter(int, int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+Ptr<Filter> cv::gpu::createScharrFilter(int, int, int, int, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+Ptr<Filter> cv::gpu::createGaussianFilter(int, int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<Filter>(); }
+
+
+
+
+
+
 Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseRowFilter_GPU>(0); }
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_no_cuda(); return Ptr<BaseColumnFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, GpuMat&, const Point&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int, GpuMat&, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, GpuMat&, double, double, int, int) { throw_no_cuda(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int, int, const Size&, Point) { throw_no_cuda(); return Ptr<BaseFilter_GPU>(0); }
 
@@ -78,14 +80,7 @@ void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_no_
 void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_no_cuda(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_no_cuda(); }
-void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, GpuMat&, int, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double, int, int) { throw_no_cuda(); }
-void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, GpuMat&, double, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double, int, int) { throw_no_cuda(); }
-void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, GpuMat&, double, double, int, int, Stream&) { throw_no_cuda(); }
+
 
 #else
 
@@ -185,6 +180,8 @@ Ptr<Filter> cv::gpu::createBoxFilter(int srcType, int dstType, Size ksize, Point
     if (dstType < 0)
         dstType = srcType;
 
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
+
     return new NPPBoxFilter(srcType, dstType, ksize, anchor, borderMode, borderVal);
 }
 
@@ -291,6 +288,8 @@ Ptr<Filter> cv::gpu::createLinearFilter(int srcType, int dstType, InputArray ker
     if (dstType < 0)
         dstType = srcType;
 
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
+
     return new LinearFilter(srcType, dstType, kernel, anchor, borderMode, borderVal);
 }
 
@@ -314,7 +313,198 @@ Ptr<Filter> cv::gpu::createLaplacianFilter(int srcType, int dstType, int ksize,
     return gpu::createLinearFilter(srcType, dstType, kernel, Point(-1,-1), borderMode, borderVal);
 }
 
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Separable Linear Filter
 
+namespace filter
+{
+    template <typename T, typename D>
+    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+
+    template <typename T, typename D>
+    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+}
+
+namespace
+{
+    class SeparableLinearFilter : public Filter
+    {
+    public:
+        SeparableLinearFilter(int srcType, int dstType,
+                              InputArray rowKernel, InputArray columnKernel,
+                              Point anchor, int rowBorderMode, int columnBorderMode);
+
+        void apply(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
+
+    private:
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
+
+        int srcType_, bufType_, dstType_;
+        GpuMat rowKernel_, columnKernel_;
+        func_t rowFilter_, columnFilter_;
+        Point anchor_;
+        int rowBorderMode_, columnBorderMode_;
+
+        GpuMat buf_;
+    };
+
+    SeparableLinearFilter::SeparableLinearFilter(int srcType, int dstType,
+                                                 InputArray _rowKernel, InputArray _columnKernel,
+                                                 Point anchor, int rowBorderMode, int columnBorderMode) :
+        srcType_(srcType), dstType_(dstType), anchor_(anchor), rowBorderMode_(rowBorderMode), columnBorderMode_(columnBorderMode)
+    {
+        static const func_t rowFilterFuncs[7][4] =
+        {
+            {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
+            {0, 0, 0, 0},
+            {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
+            {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
+            {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
+            {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
+            {0, 0, 0, 0}
+        };
+
+        static const func_t columnFilterFuncs[7][4] =
+        {
+            {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
+            {0, 0, 0, 0},
+            {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
+            {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
+            {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
+            {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
+            {0, 0, 0, 0}
+        };
+
+        const int sdepth = CV_MAT_DEPTH(srcType);
+        const int cn = CV_MAT_CN(srcType);
+        const int ddepth = CV_MAT_DEPTH(dstType);
+
+        Mat rowKernel = _rowKernel.getMat();
+        Mat columnKernel = _columnKernel.getMat();
+
+        CV_Assert( sdepth <= CV_64F && cn <= 4 );
+        CV_Assert( rowKernel.channels() == 1 );
+        CV_Assert( columnKernel.channels() == 1 );
+        CV_Assert( rowBorderMode == BORDER_REFLECT101 || rowBorderMode == BORDER_REPLICATE || rowBorderMode == BORDER_CONSTANT || rowBorderMode == BORDER_REFLECT || rowBorderMode == BORDER_WRAP );
+        CV_Assert( columnBorderMode == BORDER_REFLECT101 || columnBorderMode == BORDER_REPLICATE || columnBorderMode == BORDER_CONSTANT || columnBorderMode == BORDER_REFLECT || columnBorderMode == BORDER_WRAP );
+
+        Mat kernel32F;
+
+        rowKernel.convertTo(kernel32F, CV_32F);
+        rowKernel_.upload(kernel32F.reshape(1, 1));
+
+        columnKernel.convertTo(kernel32F, CV_32F);
+        columnKernel_.upload(kernel32F.reshape(1, 1));
+
+        CV_Assert( rowKernel_.cols > 0 && rowKernel_.cols <= 32 );
+        CV_Assert( columnKernel_.cols > 0 && columnKernel_.cols <= 32 );
+
+        normalizeAnchor(anchor_.x, rowKernel_.cols);
+        normalizeAnchor(anchor_.y, columnKernel_.cols);
+
+        bufType_ = CV_MAKE_TYPE(CV_32F, cn);
+
+        rowFilter_ = rowFilterFuncs[sdepth][cn - 1];
+        CV_Assert( rowFilter_ != 0 );
+
+        columnFilter_ = columnFilterFuncs[ddepth][cn - 1];
+        CV_Assert( columnFilter_ != 0 );
+    }
+
+    void SeparableLinearFilter::apply(InputArray _src, OutputArray _dst, Stream& _stream)
+    {
+        GpuMat src = _src.getGpuMat();
+        CV_Assert( src.type() == srcType_ );
+
+        _dst.create(src.size(), dstType_);
+        GpuMat dst = _dst.getGpuMat();
+
+        ensureSizeIsEnough(src.size(), bufType_, buf_);
+
+        DeviceInfo devInfo;
+        const int cc = devInfo.major() * 10 + devInfo.minor();
+
+        cudaStream_t stream = StreamAccessor::getStream(_stream);
+
+        rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream);
+        columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream);
+    }
+}
+
+Ptr<Filter> cv::gpu::createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel, Point anchor, int rowBorderMode, int columnBorderMode)
+{
+    if (dstType < 0)
+        dstType = srcType;
+
+    dstType = CV_MAKE_TYPE(CV_MAT_DEPTH(dstType), CV_MAT_CN(srcType));
+
+    if (columnBorderMode < 0)
+        columnBorderMode = rowBorderMode;
+
+    return new SeparableLinearFilter(srcType, dstType, rowKernel, columnKernel, anchor, rowBorderMode, columnBorderMode);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Deriv Filter
+
+Ptr<Filter> cv::gpu::createDerivFilter(int srcType, int dstType, int dx, int dy, int ksize, bool normalize, double scale, int rowBorderMode, int columnBorderMode)
+{
+    Mat kx, ky;
+    getDerivKernels(kx, ky, dx, dy, ksize, normalize, CV_32F);
+
+    if (scale != 1)
+    {
+        // usually the smoothing part is the slowest to compute,
+        // so try to scale it instead of the faster differenciating part
+        if (dx == 0)
+            kx *= scale;
+        else
+            ky *= scale;
+    }
+
+    return gpu::createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1, -1), rowBorderMode, columnBorderMode);
+}
+
+Ptr<Filter> cv::gpu::createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize, double scale, int rowBorderMode, int columnBorderMode)
+{
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, ksize, false, scale, rowBorderMode, columnBorderMode);
+}
+
+Ptr<Filter> cv::gpu::createScharrFilter(int srcType, int dstType, int dx, int dy, double scale, int rowBorderMode, int columnBorderMode)
+{
+    return gpu::createDerivFilter(srcType, dstType, dx, dy, -1, false, scale, rowBorderMode, columnBorderMode);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Gaussian Filter
+
+Ptr<Filter> cv::gpu::createGaussianFilter(int srcType, int dstType, Size ksize, double sigma1, double sigma2, int rowBorderMode, int columnBorderMode)
+{
+    const int depth = CV_MAT_DEPTH(srcType);
+
+    if (sigma2 <= 0)
+        sigma2 = sigma1;
+
+    // automatic detection of kernel size from sigma
+    if (ksize.width <= 0 && sigma1 > 0)
+        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
+    if (ksize.height <= 0 && sigma2 > 0)
+        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
+
+    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
+
+    sigma1 = std::max(sigma1, 0.0);
+    sigma2 = std::max(sigma2, 0.0);
+
+    Mat kx = getGaussianKernel(ksize.width, sigma1, CV_32F);
+    Mat ky;
+    if (ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON)
+        ky = kx;
+    else
+        ky = getGaussianKernel(ksize.height, sigma2, CV_32F);
+
+    return createSeparableLinearFilter(srcType, dstType, kx, ky, Point(-1,-1), rowBorderMode, columnBorderMode);
+}
 
 
 
@@ -377,128 +567,6 @@ namespace
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Filter2D
-
-namespace
-{
-    struct Filter2DEngine_GPU : public FilterEngine_GPU
-    {
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
-            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
-        {}
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            normalizeROI(roi, filter2D->ksize, filter2D->anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-
-            (*filter2D)(srcROI, dstROI, stream);
-        }
-
-        Ptr<BaseFilter_GPU> filter2D;
-        int srcType, dstType;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new Filter2DEngine_GPU(filter2D, srcType, dstType));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// SeparableFilter
-
-namespace
-{
-    struct SeparableFilterEngine_GPU : public FilterEngine_GPU
-    {
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf;
-        }
-
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
-                                  int srcType_, int bufType_, int dstType_,
-                                  GpuMat& buf_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_),
-            srcType(srcType_), bufType(bufType_), dstType(dstType_)
-        {
-            ksize = Size(rowFilter->ksize, columnFilter->ksize);
-            anchor = Point(rowFilter->anchor, columnFilter->anchor);
-
-            pbuf = &buf_;
-        }
-
-        virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null())
-        {
-            CV_Assert(src.type() == srcType);
-
-            Size src_size = src.size();
-
-            dst.create(src_size, dstType);
-
-            if (roi.size() != src_size)
-            {
-                dst.setTo(Scalar::all(0), stream);
-            }
-
-            ensureSizeIsEnough(src_size, bufType, *pbuf);
-
-            normalizeROI(roi, ksize, anchor, src_size);
-
-            GpuMat srcROI = src(roi);
-            GpuMat dstROI = dst(roi);
-            GpuMat bufROI = (*pbuf)(roi);
-
-            (*rowFilter)(srcROI, bufROI, stream);
-            (*columnFilter)(bufROI, dstROI, stream);
-        }
-
-        Ptr<BaseRowFilter_GPU> rowFilter;
-        Ptr<BaseColumnFilter_GPU> columnFilter;
-
-        int srcType, bufType, dstType;
-
-        Size ksize;
-        Point anchor;
-
-        GpuMat buf;
-        GpuMat* pbuf;
-    };
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
-{
-    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // 1D Sum Filter
 
@@ -829,433 +897,6 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Separable Linear Filter
-
-namespace filter
-{
-    template <typename T, typename D>
-    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-namespace
-{
-    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
-        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
-
-    typedef void (*gpuFilter1D_t)(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-
-    struct NppLinearRowFilter : public BaseRowFilter_GPU
-    {
-        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearRowFilter : public BaseRowFilter_GPU
-    {
-        GpuLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
-
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-
-Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
-        {0, 0, 0, 0},
-        {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>},
-        {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>},
-        {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>},
-        {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
-    };
-
-    if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
-
-        GpuMat gpu_row_krnl;
-        int nDivisor;
-        normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
-
-        const int ksize = gpu_row_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor, npp_funcs[CV_MAT_CN(srcType)]));
-    }
-
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int sdepth = CV_MAT_DEPTH(srcType);
-    const int cn = CV_MAT_CN(srcType);
-    CV_Assert( sdepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    const gpuFilter1D_t func = funcs[sdepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_row_krnl;
-    normalizeKernel(rowKernel, gpu_row_krnl, CV_32F);
-
-    const int ksize = gpu_row_krnl.cols;
-    CV_Assert( ksize > 0 && ksize <= 32 );
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, gpu_row_krnl, func, borderType));
-}
-
-namespace
-{
-    struct NppLinearColumnFilter : public BaseColumnFilter_GPU
-    {
-        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            cudaStream_t stream = StreamAccessor::getStream(s);
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
-                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        GpuMat kernel;
-        Npp32s nDivisor;
-        nppFilter1D_t func;
-    };
-
-    struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
-    {
-        GpuLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, gpuFilter1D_t func_, int brd_type_) :
-            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}
-
-        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
-        {
-            DeviceInfo devInfo;
-            int cc = devInfo.major() * 10 + devInfo.minor();
-            if (ksize > 16 && cc < 20)
-                CV_Error(cv::Error::StsNotImplemented, "column linear filter doesn't implemented for kernel size > 16 for device with compute capabilities less than 2.0");
-
-            func(src, dst, kernel.ptr<float>(), ksize, anchor, brd_type, cc, StreamAccessor::getStream(s));
-        }
-
-        GpuMat kernel;
-        gpuFilter1D_t func;
-        int brd_type;
-    };
-}
-
-Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
-{
-    static const gpuFilter1D_t funcs[7][4] =
-    {
-        {filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
-        {0, 0, 0, 0},
-        {filter::linearColumn<float, ushort>, 0, filter::linearColumn<float3, ushort3>, filter::linearColumn<float4, ushort4>},
-        {filter::linearColumn<float, short>, 0, filter::linearColumn<float3, short3>, filter::linearColumn<float4, short4>},
-        {filter::linearColumn<float, int>, 0, filter::linearColumn<float3, int3>, filter::linearColumn<float4, int4>},
-        {filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
-        {0, 0, 0, 0}
-    };
-    static const nppFilter1D_t npp_funcs[] =
-    {
-        0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R
-    };
-
-    if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
-    {
-        CV_Assert( borderType == BORDER_CONSTANT );
-
-        GpuMat gpu_col_krnl;
-        int nDivisor;
-        normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
-
-        const int ksize = gpu_col_krnl.cols;
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, npp_funcs[CV_MAT_CN(bufType)]));
-    }
-
-    CV_Assert( borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP );
-
-    const int ddepth = CV_MAT_DEPTH(dstType);
-    const int cn = CV_MAT_CN(dstType);
-    CV_Assert( ddepth <= CV_64F && cn <= 4 );
-    CV_Assert( CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(bufType) == cn );
-
-    gpuFilter1D_t func = funcs[ddepth][cn - 1];
-    CV_Assert( func != 0 );
-
-    GpuMat gpu_col_krnl;
-    normalizeKernel(columnKernel, gpu_col_krnl, CV_32F);
-
-    const int ksize = gpu_col_krnl.cols;
-    CV_Assert(ksize > 0 && ksize <= 32);
-
-    normalizeAnchor(anchor, ksize);
-
-    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, gpu_col_krnl, func, borderType));
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
-    const Point& anchor, int rowBorderType, int columnBorderType)
-{
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
-
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
-
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
-    const Point& anchor, int rowBorderType, int columnBorderType)
-{
-    if (columnBorderType < 0)
-        columnBorderType = rowBorderType;
-
-    int cn = CV_MAT_CN(srcType);
-    int bdepth = CV_32F;
-    int bufType = CV_MAKETYPE(bdepth, cn);
-
-    Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, rowBorderType);
-    Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, columnBorderType);
-
-    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-                          Point anchor, int rowBorderType, int columnBorderType)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
-}
-
-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
-                          Point anchor, int rowBorderType, int columnBorderType,
-                          Stream& stream)
-{
-    if( ddepth < 0 )
-        ddepth = src.depth();
-
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
-
-    Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, buf, anchor, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Deriv Filter
-
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType, int columnBorderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, GpuMat& buf, int rowBorderType, int columnBorderType)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-    return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale, int rowBorderType, int columnBorderType)
-{
-    GpuMat buf;
-    Sobel(src, dst, ddepth, dx, dy, buf, ksize, scale, rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
-
-    if (scale != 1)
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if (dx == 0)
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
-}
-
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale, int rowBorderType, int columnBorderType)
-{
-    GpuMat buf;
-    Scharr(src, dst, ddepth, dx, dy, buf, scale, rowBorderType, columnBorderType);
-}
-
-void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    Mat kx, ky;
-    getDerivKernels(kx, ky, dx, dy, -1, false, CV_32F);
-
-    if( scale != 1 )
-    {
-        // usually the smoothing part is the slowest to compute,
-        // so try to scale it instead of the faster differenciating part
-        if( dx == 0 )
-            kx *= scale;
-        else
-            ky *= scale;
-    }
-
-    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Gaussian Filter
-
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    int depth = CV_MAT_DEPTH(type);
-
-    if (sigma2 <= 0)
-        sigma2 = sigma1;
-
-    // automatic detection of kernel size from sigma
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
-
-    sigma1 = std::max(sigma1, 0.0);
-    sigma2 = std::max(sigma2, 0.0);
-
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
-    Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    int depth = CV_MAT_DEPTH(type);
-
-    if (sigma2 <= 0)
-        sigma2 = sigma1;
-
-    // automatic detection of kernel size from sigma
-    if (ksize.width <= 0 && sigma1 > 0)
-        ksize.width = cvRound(sigma1 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-    if (ksize.height <= 0 && sigma2 > 0)
-        ksize.height = cvRound(sigma2 * (depth == CV_8U ? 3 : 4)*2 + 1) | 1;
-
-    CV_Assert( ksize.width > 0 && ksize.width % 2 == 1 && ksize.height > 0 && ksize.height % 2 == 1 );
-
-    sigma1 = std::max(sigma1, 0.0);
-    sigma2 = std::max(sigma2, 0.0);
-
-    Mat kx = getGaussianKernel( ksize.width, sigma1, std::max(depth, CV_32F) );
-    Mat ky;
-    if( ksize.height == ksize.width && std::abs(sigma1 - sigma2) < DBL_EPSILON )
-        ky = kx;
-    else
-        ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
-
-    return createSeparableLinearFilter_GPU(type, type, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType);
-}
-
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{
-    if (ksize.width == 1 && ksize.height == 1)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
-}
-
-void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType, Stream& stream)
-{
-    if (ksize.width == 1 && ksize.height == 1)
-    {
-        src.copyTo(dst);
-        return;
-    }
-
-    dst.create(src.size(), src.type());
-
-    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
-    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Image Rank Filter
 
diff --git a/modules/gpufilters/test/test_filters.cpp b/modules/gpufilters/test/test_filters.cpp
index 42018424d..e052ede6c 100644
--- a/modules/gpufilters/test/test_filters.cpp
+++ b/modules/gpufilters/test/test_filters.cpp
@@ -215,13 +215,74 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, Laplacian, testing::Combine(
     testing::Values(KSize(cv::Size(1, 1)), KSize(cv::Size(3, 3))),
     WHOLE_SUBMAT));
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// SeparableLinearFilter
 
+PARAM_TEST_CASE(SeparableLinearFilter, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, KSize, Anchor, BorderType, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int depth;
+    int cn;
+    cv::Size ksize;
+    cv::Point anchor;
+    int borderType;
+    bool useRoi;
 
+    int type;
 
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        cn = GET_PARAM(3);
+        ksize = GET_PARAM(4);
+        anchor = GET_PARAM(5);
+        borderType = GET_PARAM(6);
+        useRoi = GET_PARAM(7);
 
+        cv::gpu::setDevice(devInfo.deviceID());
 
+        type = CV_MAKE_TYPE(depth, cn);
+    }
+};
 
+GPU_TEST_P(SeparableLinearFilter, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Mat rowKernel = randomMat(Size(ksize.width, 1), CV_32FC1, 0.0, 1.0);
+    cv::Mat columnKernel = randomMat(Size(ksize.height, 1), CV_32FC1, 0.0, 1.0);
 
+    cv::Ptr<cv::gpu::Filter> filter = cv::gpu::createSeparableLinearFilter(src.type(), -1, rowKernel, columnKernel, anchor, borderType);
+
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    filter->apply(loadMat(src, useRoi), dst);
+
+    cv::Mat dst_gold;
+    cv::sepFilter2D(src, dst_gold, -1, rowKernel, columnKernel, anchor, 0, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 1.0 : 1e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Filters, SeparableLinearFilter, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(KSize(cv::Size(3, 3)),
+                    KSize(cv::Size(7, 7)),
+                    KSize(cv::Size(13, 13)),
+                    KSize(cv::Size(15, 15)),
+                    KSize(cv::Size(17, 17)),
+                    KSize(cv::Size(23, 15)),
+                    KSize(cv::Size(31, 3))),
+    testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101),
+                    BorderType(cv::BORDER_REPLICATE),
+                    BorderType(cv::BORDER_CONSTANT),
+                    BorderType(cv::BORDER_REFLECT)),
+    WHOLE_SUBMAT));
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Sobel
@@ -265,13 +326,15 @@ GPU_TEST_P(Sobel, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> sobel = cv::gpu::createSobelFilter(src.type(), -1, dx, dy, ksize.width, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Sobel(loadMat(src, useRoi), dst, -1, dx, dy, ksize.width, 1.0, borderType);
+    sobel->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Sobel(src, dst_gold, -1, dx, dy, ksize.width, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Sobel, testing::Combine(
@@ -328,13 +391,15 @@ GPU_TEST_P(Scharr, Accuracy)
 
     cv::Mat src = randomMat(size, type);
 
+    cv::Ptr<cv::gpu::Filter> scharr = cv::gpu::createScharrFilter(src.type(), -1, dx, dy, 1.0, borderType);
+
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::Scharr(loadMat(src, useRoi), dst, -1, dx, dy, 1.0, borderType);
+    scharr->apply(loadMat(src, useRoi), dst);
 
     cv::Mat dst_gold;
     cv::Scharr(src, dst_gold, -1, dx, dy, 1.0, 0.0, borderType);
 
-    EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 0.0 : 0.1);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, Scharr, testing::Combine(
@@ -387,28 +452,15 @@ GPU_TEST_P(GaussianBlur, Accuracy)
     double sigma1 = randomDouble(0.1, 1.0);
     double sigma2 = randomDouble(0.1, 1.0);
 
-    if (ksize.height > 16 && !supportFeature(devInfo, cv::gpu::FEATURE_SET_COMPUTE_20))
-    {
-        try
-        {
-            cv::gpu::GpuMat dst;
-            cv::gpu::GaussianBlur(loadMat(src), dst, ksize, sigma1, sigma2, borderType);
-        }
-        catch (const cv::Exception& e)
-        {
-            ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
-        }
-    }
-    else
-    {
-        cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-        cv::gpu::GaussianBlur(loadMat(src, useRoi), dst, ksize, sigma1, sigma2, borderType);
+    cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(src.type(), -1, ksize, sigma1, sigma2, borderType);
 
-        cv::Mat dst_gold;
-        cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+    cv::gpu::GpuMat dst = createMat(size, type, useRoi);
+    gauss->apply(loadMat(src, useRoi), dst);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 4.0);
-    }
+    cv::Mat dst_gold;
+    cv::GaussianBlur(src, dst_gold, ksize, sigma1, sigma2, borderType);
+
+    EXPECT_MAT_NEAR(dst_gold, dst, src.depth() < CV_32F ? 4.0 : 1e-4);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
@@ -437,6 +489,15 @@ INSTANTIATE_TEST_CASE_P(GPU_Filters, GaussianBlur, testing::Combine(
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
+
+
+
+
+
+
+
+
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Erode
 
diff --git a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
index cf1b8e670..3fe9f82f4 100644
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -158,7 +158,7 @@ struct CV_EXPORTS CannyBuf
     GpuMat mag;
     GpuMat map;
     GpuMat st1, st2;
-    Ptr<FilterEngine_GPU> filterDX, filterDY;
+    Ptr<Filter> filterDX, filterDY;
 };
 
 CV_EXPORTS void Canny(const GpuMat& image, GpuMat& edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false);
diff --git a/modules/gpuimgproc/src/canny.cpp b/modules/gpuimgproc/src/canny.cpp
index 8d361fe50..9a3357564 100644
--- a/modules/gpuimgproc/src/canny.cpp
+++ b/modules/gpuimgproc/src/canny.cpp
@@ -65,8 +65,8 @@ void cv::gpu::CannyBuf::create(const Size& image_size, int apperture_size)
 
         if (apperture_size != 3)
         {
-            filterDX = createDerivFilter_GPU(CV_8UC1, CV_32S, 1, 0, apperture_size, BORDER_REPLICATE);
-            filterDY = createDerivFilter_GPU(CV_8UC1, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE);
+            filterDX = createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size, false, 1, BORDER_REPLICATE);
+            filterDY = createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size, false, 1, BORDER_REPLICATE);
         }
     }
 
@@ -150,8 +150,8 @@ void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_th
     }
     else
     {
-        buf.filterDX->apply(src, buf.dx, Rect(0, 0, src.cols, src.rows));
-        buf.filterDY->apply(src, buf.dy, Rect(0, 0, src.cols, src.rows));
+        buf.filterDX->apply(src, buf.dx);
+        buf.filterDY->apply(src, buf.dy);
 
         calcMagnitude(buf.dx, buf.dy, buf.mag, L2gradient);
     }
diff --git a/modules/gpuimgproc/src/corners.cpp b/modules/gpuimgproc/src/corners.cpp
index 44dc1505d..824a3308e 100644
--- a/modules/gpuimgproc/src/corners.cpp
+++ b/modules/gpuimgproc/src/corners.cpp
@@ -70,6 +70,8 @@ namespace
 {
     void extractCovData(const GpuMat& src, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
     {
+        (void) buf;
+
         double scale = static_cast<double>(1 << ((ksize > 0 ? ksize : 3) - 1)) * blockSize;
 
         if (ksize < 0)
@@ -83,16 +85,21 @@ namespace
         Dx.create(src.size(), CV_32F);
         Dy.create(src.size(), CV_32F);
 
+        Ptr<gpu::Filter> filterDx, filterDy;
+
         if (ksize > 0)
         {
-            Sobel(src, Dx, CV_32F, 1, 0, buf, ksize, scale, borderType, -1, stream);
-            Sobel(src, Dy, CV_32F, 0, 1, buf, ksize, scale, borderType, -1, stream);
+            filterDx = gpu::createSobelFilter(src.type(), CV_32F, 1, 0, ksize, scale, borderType);
+            filterDy = gpu::createSobelFilter(src.type(), CV_32F, 0, 1, ksize, scale, borderType);
         }
         else
         {
-            Scharr(src, Dx, CV_32F, 1, 0, buf, scale, borderType, -1, stream);
-            Scharr(src, Dy, CV_32F, 0, 1, buf, scale, borderType, -1, stream);
+            filterDx = gpu::createScharrFilter(src.type(), CV_32F, 1, 0, scale, borderType);
+            filterDy = gpu::createScharrFilter(src.type(), CV_32F, 0, 1, scale, borderType);
         }
+
+        filterDx->apply(src, Dx);
+        filterDy->apply(src, Dy);
     }
 }
 
diff --git a/modules/superres/src/btv_l1_gpu.cpp b/modules/superres/src/btv_l1_gpu.cpp
index 6813187c4..7b2ad7370 100644
--- a/modules/superres/src/btv_l1_gpu.cpp
+++ b/modules/superres/src/btv_l1_gpu.cpp
@@ -230,7 +230,7 @@ namespace
         Ptr<DenseOpticalFlowExt> opticalFlow_;
 
     private:
-        std::vector<Ptr<FilterEngine_GPU> > filters_;
+        std::vector<Ptr<gpu::Filter> > filters_;
         int curBlurKernelSize_;
         double curBlurSigma_;
         int curSrcType_;
@@ -299,7 +299,7 @@ namespace
         {
             filters_.resize(src.size());
             for (size_t i = 0; i < src.size(); ++i)
-                filters_[i] = createGaussianFilter_GPU(src[0].type(), Size(blurKernelSize_, blurKernelSize_), blurSigma_);
+                filters_[i] = gpu::createGaussianFilter(src[0].type(), -1, Size(blurKernelSize_, blurKernelSize_), blurSigma_);
             curBlurKernelSize_ = blurKernelSize_;
             curBlurSigma_ = blurSigma_;
             curSrcType_ = src[0].type();
@@ -346,7 +346,7 @@ namespace
                 // a = M * Ih
                 gpu::remap(highRes_, a_[k], backwardMaps_[k].first, backwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
                 // b = HM * Ih
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // c = DHF * Ih
                 gpu::resize(b_[k], c_[k], lowResSize, 0, 0, INTER_NEAREST, streams_[k]);
 
@@ -355,7 +355,7 @@ namespace
                 // a = Dt * diff
                 upscale(c_[k], a_[k], scale_, streams_[k]);
                 // b = HtDt * diff
-                filters_[k]->apply(a_[k], b_[k], Rect(0,0,-1,-1), streams_[k]);
+                filters_[k]->apply(a_[k], b_[k], streams_[k]);
                 // diffTerm = MtHtDt * diff
                 gpu::remap(b_[k], diffTerms_[k], forwardMaps_[k].first, forwardMaps_[k].second, INTER_NEAREST, BORDER_REPLICATE, Scalar(), streams_[k]);
             }
diff --git a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
index 87b525599..1815cc6de 100644
--- a/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
+++ b/samples/cpp/tutorial_code/gpu/gpu-basics-similarity/gpu-basics-similarity.cpp
@@ -308,6 +308,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
     gpu::split(tmp2, vI2);
     Scalar mssim;
 
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(vI2[0].type(), -1, Size(11, 11), 1.5);
+
     for( int i = 0; i < gI1.channels(); ++i )
     {
         gpu::GpuMat I2_2, I1_2, I1_I2;
@@ -318,8 +320,8 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         /*************************** END INITS **********************************/
         gpu::GpuMat mu1, mu2;   // PRELIMINARY COMPUTING
-        gpu::GaussianBlur(vI1[i], mu1, Size(11, 11), 1.5);
-        gpu::GaussianBlur(vI2[i], mu2, Size(11, 11), 1.5);
+        gauss->apply(vI1[i], mu1);
+        gauss->apply(vI2[i], mu2);
 
         gpu::GpuMat mu1_2, mu2_2, mu1_mu2;
         gpu::multiply(mu1, mu1, mu1_2);
@@ -328,13 +330,13 @@ Scalar getMSSIM_GPU( const Mat& i1, const Mat& i2)
 
         gpu::GpuMat sigma1_2, sigma2_2, sigma12;
 
-        gpu::GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
+        gauss->apply(I1_2, sigma1_2);
         gpu::subtract(sigma1_2, mu1_2, sigma1_2); // sigma1_2 -= mu1_2;
 
-        gpu::GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
+        gauss->apply(I2_2, sigma2_2);
         gpu::subtract(sigma2_2, mu2_2, sigma2_2); // sigma2_2 -= mu2_2;
 
-        gpu::GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
+        gauss->apply(I1_I2, sigma12);
         gpu::subtract(sigma12, mu1_mu2, sigma12); // sigma12 -= mu1_mu2;
 
         ///////////////////////////////// FORMULA ////////////////////////////////
@@ -375,7 +377,7 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
     gpu::split(b.t2, b.vI2, stream);
     Scalar mssim;
 
-    gpu::GpuMat buf;
+    Ptr<gpu::Filter> gauss = gpu::createGaussianFilter(b.vI1[0].type(), -1, Size(11, 11), 1.5);
 
     for( int i = 0; i < b.gI1.channels(); ++i )
     {
@@ -383,22 +385,22 @@ Scalar getMSSIM_GPU_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
         gpu::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
         gpu::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
 
-        gpu::GaussianBlur(b.vI1[i], b.mu1, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
-        gpu::GaussianBlur(b.vI2[i], b.mu2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.vI1[i], b.mu1, stream);
+        gauss->apply(b.vI2[i], b.mu2, stream);
 
         gpu::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
         gpu::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
         gpu::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
 
-        gpu::GaussianBlur(b.I1_2, b.sigma1_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_2, b.sigma1_2, stream);
         gpu::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, gpu::GpuMat(), -1, stream);
         //b.sigma1_2 -= b.mu1_2;  - This would result in an extra data transfer operation
 
-        gpu::GaussianBlur(b.I2_2, b.sigma2_2, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I2_2, b.sigma2_2, stream);
         gpu::subtract(b.sigma2_2, b.mu2_2, b.sigma2_2, gpu::GpuMat(), -1, stream);
         //b.sigma2_2 -= b.mu2_2;
 
-        gpu::GaussianBlur(b.I1_I2, b.sigma12, Size(11, 11), buf, 1.5, 0, BORDER_DEFAULT, -1, stream);
+        gauss->apply(b.I1_I2, b.sigma12, stream);
         gpu::subtract(b.sigma12, b.mu1_mu2, b.sigma12, gpu::GpuMat(), -1, stream);
         //b.sigma12 -= b.mu1_mu2;
 
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index f6ace6d77..cbe50a40f 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -929,10 +929,12 @@ TEST(GaussianBlur)
         gpu::GpuMat d_dst(src.size(), src.type());
         gpu::GpuMat d_buf;
 
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        cv::Ptr<cv::gpu::Filter> gauss = cv::gpu::createGaussianFilter(d_src.type(), -1, cv::Size(3, 3), 1);
+
+        gauss->apply(d_src, d_dst);
 
         GPU_ON;
-        gpu::GaussianBlur(d_src, d_dst, Size(3, 3), d_buf, 1);
+        gauss->apply(d_src, d_dst);
         GPU_OFF;
     }
 }