diff --git a/modules/gpu/doc/image_filtering.rst b/modules/gpu/doc/image_filtering.rst
index 7b21fc353..2e674cf4f 100644
--- a/modules/gpu/doc/image_filtering.rst
+++ b/modules/gpu/doc/image_filtering.rst
@@ -395,7 +395,7 @@ Applies the non-separable 2D linear filter to an image.
 
 .. ocv:function:: void gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null())
 
-    :param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
+    :param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` and ``CV_32FC1`` source types are supported.
 
     :param dst: Destination image. The size and the number of channels is the same as ``src`` .
 
diff --git a/modules/gpu/perf/perf_filters.cpp b/modules/gpu/perf/perf_filters.cpp
index f6ba4a9d3..932255728 100644
--- a/modules/gpu/perf/perf_filters.cpp
+++ b/modules/gpu/perf/perf_filters.cpp
@@ -102,8 +102,8 @@ GPU_PERF_TEST(LinearFilter, cv::gpu::DeviceInfo, cv::Size, perf::MatType, int)
 INSTANTIATE_TEST_CASE_P(Filter, LinearFilter, testing::Combine(
     ALL_DEVICES,
     GPU_TYPICAL_MAT_SIZES,
-    testing::Values(CV_8UC1, CV_8UC4),
-    testing::Values(3, 5)));
+    testing::Values(CV_8UC1, CV_8UC4, CV_32FC1),
+    testing::Values(3, 5, 7, 9)));
 
 //////////////////////////////////////////////////////////////////////
 // SeparableLinearFilter
diff --git a/modules/gpu/perf/perf_imgproc.cpp b/modules/gpu/perf/perf_imgproc.cpp
index 1418d09ba..8c31d83fb 100644
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -727,11 +727,12 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Dft, testing::Combine(
 //////////////////////////////////////////////////////////////////////
 // Convolve
 
-GPU_PERF_TEST(Convolve, cv::gpu::DeviceInfo, cv::Size, int)
+GPU_PERF_TEST(Convolve, cv::gpu::DeviceInfo, cv::Size, int, bool)
 {
     cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
     cv::Size size = GET_PARAM(1);
     int templ_size = GET_PARAM(2);
+    bool ccorr = GET_PARAM(3);
 
     cv::gpu::setDevice(devInfo.deviceID());
 
@@ -748,14 +749,15 @@ GPU_PERF_TEST(Convolve, cv::gpu::DeviceInfo, cv::Size, int)
 
     TEST_CYCLE()
     {
-        cv::gpu::convolve(image, templ, dst, false, buf);
+        cv::gpu::convolve(image, templ, dst, ccorr, buf);
     }
 }
 
 INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, testing::Combine(
     ALL_DEVICES,
     GPU_TYPICAL_MAT_SIZES,
-    testing::Values(3, 9, 27, 32, 64)));
+    testing::Values(3, 9, 27, 32, 64),
+    testing::Bool()));
 
 //////////////////////////////////////////////////////////////////////
 // PyrDown
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 72053ad3b..1a302f455 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -904,79 +904,49 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall(cudaDeviceSynchronize());
         }
 
-        //////////////////////////////////////////////////////////////////////////
-        // convolve
+        // filter2D
 
-        #define CONVOLVE_MAX_KERNEL_SIZE 17
+        #define FILTER2D_MAX_KERNEL_SIZE 16
 
-        __constant__ float c_convolveKernel[CONVOLVE_MAX_KERNEL_SIZE * CONVOLVE_MAX_KERNEL_SIZE];
+        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
 
-        __global__ void convolve(const DevMem2Df src, PtrStepf dst, int kWidth, int kHeight)
+        texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeBorder);
+
+        __global__ void filter2D(int ofsX, int ofsY, DevMem2Df dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
         {
-            __shared__ float smem[16 + 2 * 8][16 + 2 * 8];
-
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-            // x | x 0 | 0
-            // -----------
-            // x | x 0 | 0
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-            smem[threadIdx.y][threadIdx.x] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];
+            if (x >= dst.cols || y >= dst.rows)
+                return;
 
-            // 0 | 0 x | x
-            // -----------
-            // 0 | 0 x | x
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-            smem[threadIdx.y][threadIdx.x + 16] = src.ptr(::min(::max(y - 8, 0), src.rows - 1))[::min(x + 8, src.cols - 1)];
+            float res = 0;
 
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-            // x | x 0 | 0
-            // -----------
-            // x | x 0 | 0
-            smem[threadIdx.y + 16][threadIdx.x] = src.ptr(::min(y + 8, src.rows - 1))[::min(::max(x - 8, 0), src.cols - 1)];
+            const int baseX = ofsX + x - anchorX;
+            const int baseY = ofsY + y - anchorY;
 
-            // 0 | 0 0 | 0
-            // -----------
-            // 0 | 0 0 | 0
-            // 0 | 0 x | x
-            // -----------
-            // 0 | 0 x | x
-            smem[threadIdx.y + 16][threadIdx.x + 16] = src.ptr(::min(y + 8, src.rows - 1))[::min(x + 8, src.cols - 1)];
+            int kInd = 0;
 
-            __syncthreads();
-
-            if (x < src.cols && y < src.rows)
+            for (int i = 0; i < kHeight; ++i)
             {
-                float res = 0;
-
-                for (int i = 0; i < kHeight; ++i)
-                {
-                    for (int j = 0; j < kWidth; ++j)
-                    {
-                        res += smem[threadIdx.y + 8 - kHeight / 2 + i][threadIdx.x + 8 - kWidth / 2 + j] * c_convolveKernel[i * kWidth + j];
-                    }
-                }
-
-                dst.ptr(y)[x] = res;
+                for (int j = 0; j < kWidth; ++j)
+                    res += tex2D(filter2DTex, baseX + j, baseY + i) * c_filter2DKernel[kInd++];
             }
+
+            dst.ptr(y)[x] = res;
         }
 
-        void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream)
+        void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream)
        {
-            cudaSafeCall(cudaMemcpyToSymbol(c_convolveKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
+            cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
 
             const dim3 block(16, 16);
-            const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
+            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
 
-            convolve<<<grid, block, 0, stream>>>(src, dst, kWidth, kHeight);
+            bindTexture(&filter2DTex, src);
+
+            filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY);
             cudaSafeCall(cudaGetLastError());
 
             if (stream == 0)
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index e9977c6d3..7af32c1af 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -659,6 +659,14 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Linear Filter
 
+namespace cv { namespace gpu { namespace device
+{
+    namespace imgproc
+    {
+        void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream);
+    }
+}}}
+
 namespace
 {
     typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
@@ -696,20 +704,56 @@ namespace
         Npp32s nDivisor;
         nppFilter2D_t func;
     };
+
+    struct GpuLinearFilter : public BaseFilter_GPU
+    {
+        GpuLinearFilter(Size ksize_, Point anchor_, const GpuMat& kernel_) :
+            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_) {}
+
+        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
+        {
+            using namespace cv::gpu::device::imgproc;
+
+            Point ofs;
+            Size wholeSize;
+            src.locateROI(wholeSize, ofs);
+            GpuMat srcWhole(wholeSize, src.type(), src.datastart);
+
+            filter2D_gpu(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), StreamAccessor::getStream(stream));
+        }
+
+        GpuMat kernel;
+    };
 }
 
 Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, Point anchor)
 {
-    static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
+    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_32FC1);
+    CV_Assert(dstType == srcType);
 
-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
+    if (srcType == CV_32FC1)
+    {
+        CV_Assert(ksize.width * ksize.height <= 16 * 16);
+
+        GpuMat gpu_krnl;
+        normalizeKernel(kernel, gpu_krnl, CV_32F);
+
+        normalizeAnchor(anchor, ksize);
+
+        return Ptr<BaseFilter_GPU>(new GpuLinearFilter(ksize, anchor, gpu_krnl));
+    }
+    else
+    {
+        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
 
-    GpuMat gpu_krnl;
-    int nDivisor;
-    normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
-    normalizeAnchor(anchor, ksize);
+        GpuMat gpu_krnl;
+        int nDivisor;
+        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
 
-    return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
+        normalizeAnchor(anchor, ksize);
+
+        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
+    }
 }
 
 Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor)
@@ -729,7 +773,8 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
     dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
 
     Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor);
-    f->apply(src, dst, Rect(0, 0, -1, -1), stream);
+
+    f->apply(src, dst, src.type() == CV_32FC1 ? Rect(0, 0, src.cols, src.rows) : Rect(0, 0, -1, -1), stream);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpu/src/imgproc.cpp b/modules/gpu/src/imgproc.cpp
index 205c5b2ed..7cb631755 100644
--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -1673,137 +1673,82 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
     convolve(image, templ, result, ccorr, buf);
 }
 
-namespace cv { namespace gpu { namespace device
-{
-    namespace imgproc
-    {
-        void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);
-    }
-}}}
-
 void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
 {
     using namespace ::cv::gpu::device::imgproc;
 
 #ifndef HAVE_CUFFT
-
-    CV_Assert(image.type() == CV_32F);
-    CV_Assert(templ.type() == CV_32F);
-    CV_Assert(templ.cols <= 17 && templ.rows <= 17);
-
-    result.create(image.size(), CV_32F);
-
-    GpuMat& contKernel = buf.templ_block;
-
-    if (templ.isContinuous())
-        contKernel = templ;
-    else
-    {
-        contKernel = createContinuous(templ.size(), templ.type());
-
-        if (stream)
-            stream.enqueueCopy(templ, contKernel);
-        else
-            templ.copyTo(contKernel);
-    }
-
-    convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
-
+    throw_nogpu();
 #else
-
     StaticAssert<sizeof(float) == sizeof(cufftReal)>::check();
     StaticAssert<sizeof(float) * 2 == sizeof(cufftComplex)>::check();
 
     CV_Assert(image.type() == CV_32F);
     CV_Assert(templ.type() == CV_32F);
 
-    if (templ.cols < 13 && templ.rows < 13)
+    buf.create(image.size(), templ.size());
+    result.create(buf.result_size, CV_32F);
+
+    Size& block_size = buf.block_size;
+    Size& dft_size = buf.dft_size;
+
+    GpuMat& image_block = buf.image_block;
+    GpuMat& templ_block = buf.templ_block;
+    GpuMat& result_data = buf.result_data;
+
+    GpuMat& image_spect = buf.image_spect;
+    GpuMat& templ_spect = buf.templ_spect;
+    GpuMat& result_spect = buf.result_spect;
+
+    cufftHandle planR2C, planC2R;
+    cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
+    cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
+
+    cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
+    cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
+
+    GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
+    copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
+                   templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
+
+    cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
+                               templ_spect.ptr<cufftComplex>()));
+
+    // Process all blocks of the result matrix
+    for (int y = 0; y < result.rows; y += block_size.height)
     {
-        result.create(image.size(), CV_32F);
-
-        GpuMat& contKernel = buf.templ_block;
-
-        if (templ.isContinuous())
-            contKernel = templ;
-        else
+        for (int x = 0; x < result.cols; x += block_size.width)
         {
-            contKernel = createContinuous(templ.size(), templ.type());
+            Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
+                                std::min(y + dft_size.height, image.rows) - y);
+            GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
+                             image.step);
+            copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
+                           0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
+
+            cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
+                                       image_spect.ptr<cufftComplex>()));
+            mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
+                                 1.f / dft_size.area(), ccorr, stream);
+            cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
+                                       result_data.ptr<cufftReal>()));
+
+            Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
+                                 std::min(y + block_size.height, result.rows) - y);
+            GpuMat result_roi(result_roi_size, result.type(),
+                              (void*)(result.ptr<float>(y) + x), result.step);
+            GpuMat result_block(result_roi_size, result_data.type(),
+                                result_data.ptr(), result_data.step);
 
             if (stream)
-                stream.enqueueCopy(templ, contKernel);
+                stream.enqueueCopy(result_block, result_roi);
             else
-                templ.copyTo(contKernel);
+                result_block.copyTo(result_roi);
         }
-
-        convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
-    }
-    else
-    {
-        buf.create(image.size(), templ.size());
-        result.create(buf.result_size, CV_32F);
-
-        Size& block_size = buf.block_size;
-        Size& dft_size = buf.dft_size;
-
-        GpuMat& image_block = buf.image_block;
-        GpuMat& templ_block = buf.templ_block;
-        GpuMat& result_data = buf.result_data;
-
-        GpuMat& image_spect = buf.image_spect;
-        GpuMat& templ_spect = buf.templ_spect;
-        GpuMat& result_spect = buf.result_spect;
-
-        cufftHandle planR2C, planC2R;
-        cufftSafeCall(cufftPlan2d(&planC2R, dft_size.height, dft_size.width, CUFFT_C2R));
-        cufftSafeCall(cufftPlan2d(&planR2C, dft_size.height, dft_size.width, CUFFT_R2C));
-
-        cufftSafeCall( cufftSetStream(planR2C, StreamAccessor::getStream(stream)) );
-        cufftSafeCall( cufftSetStream(planC2R, StreamAccessor::getStream(stream)) );
-
-        GpuMat templ_roi(templ.size(), CV_32F, templ.data, templ.step);
-        copyMakeBorder(templ_roi, templ_block, 0, templ_block.rows - templ_roi.rows, 0,
-                       templ_block.cols - templ_roi.cols, 0, Scalar(), stream);
-
-        cufftSafeCall(cufftExecR2C(planR2C, templ_block.ptr<cufftReal>(),
-                                   templ_spect.ptr<cufftComplex>()));
-
-        // Process all blocks of the result matrix
-        for (int y = 0; y < result.rows; y += block_size.height)
-        {
-            for (int x = 0; x < result.cols; x += block_size.width)
-            {
-                Size image_roi_size(std::min(x + dft_size.width, image.cols) - x,
-                                    std::min(y + dft_size.height, image.rows) - y);
-                GpuMat image_roi(image_roi_size, CV_32F, (void*)(image.ptr<float>(y) + x),
-                                 image.step);
-                copyMakeBorder(image_roi, image_block, 0, image_block.rows - image_roi.rows,
-                               0, image_block.cols - image_roi.cols, 0, Scalar(), stream);
-
-                cufftSafeCall(cufftExecR2C(planR2C, image_block.ptr<cufftReal>(),
-                                           image_spect.ptr<cufftComplex>()));
-                mulAndScaleSpectrums(image_spect, templ_spect, result_spect, 0,
-                                     1.f / dft_size.area(), ccorr, stream);
-                cufftSafeCall(cufftExecC2R(planC2R, result_spect.ptr<cufftComplex>(),
-                                           result_data.ptr<cufftReal>()));
-
-                Size result_roi_size(std::min(x + block_size.width, result.cols) - x,
-                                     std::min(y + block_size.height, result.rows) - y);
-                GpuMat result_roi(result_roi_size, result.type(),
-                                  (void*)(result.ptr<float>(y) + x), result.step);
-                GpuMat result_block(result_roi_size, result_data.type(),
-                                    result_data.ptr(), result_data.step);
-
-                if (stream)
-                    stream.enqueueCopy(result_block, result_roi);
-                else
-                    result_block.copyTo(result_roi);
-            }
-        }
-
-        cufftSafeCall(cufftDestroy(planR2C));
-        cufftSafeCall(cufftDestroy(planC2R));
     }
 
+    cufftSafeCall(cufftDestroy(planR2C));
+    cufftSafeCall(cufftDestroy(planC2R));
 #endif
 }
diff --git a/modules/gpu/test/test_filters.cpp b/modules/gpu/test/test_filters.cpp
index d5c668f63..58b473a9e 100644
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@@ -629,4 +629,94 @@ INSTANTIATE_TEST_CASE_P(Filter, MorphEx, Combine(
     Values((int)cv::MORPH_OPEN, (int)cv::MORPH_CLOSE, (int)cv::MORPH_GRADIENT, (int)cv::MORPH_TOPHAT, (int)cv::MORPH_BLACKHAT),
     USE_ROI));
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// filter2D
+
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, int, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    int ksize;
+    bool useRoi;
+
+    cv::Mat img;
+    cv::Mat kernel;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        ksize = GET_PARAM(1);
+        useRoi = GET_PARAM(2);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        img = readImage("stereobp/aloe-L.png");
+        ASSERT_FALSE(img.empty());
+
+        kernel = cv::Mat::ones(ksize, ksize, CV_32FC1);
+    }
+};
+
+TEST_P(Filter2D, Rgba)
+{
+    cv::Mat src;
+    cv::cvtColor(img, src, CV_BGR2BGRA);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, cv::Point(-1, -1), 0, cv::BORDER_CONSTANT);
+
+    cv::Mat dst;
+
+    cv::gpu::GpuMat dev_dst;
+
+    cv::gpu::filter2D(loadMat(src, useRoi), dev_dst, -1, kernel);
+
+    dev_dst.download(dst);
+
+    EXPECT_MAT_NEAR_KSIZE(dst_gold, dst, ksize, 0.0);
+}
+
+TEST_P(Filter2D, Gray)
+{
+    cv::Mat src;
+    cv::cvtColor(img, src, CV_BGR2GRAY);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, cv::Point(-1, -1), 0, cv::BORDER_CONSTANT);
+
+    cv::Mat dst;
+
+    cv::gpu::GpuMat dev_dst;
+
+    cv::gpu::filter2D(loadMat(src, useRoi), dev_dst, -1, kernel);
+
+    dev_dst.download(dst);
+
+    EXPECT_MAT_NEAR_KSIZE(dst_gold, dst, ksize, 0.0);
+}
+
+TEST_P(Filter2D, 32FC1)
+{
+    cv::Mat src;
+    cv::cvtColor(img, src, CV_BGR2GRAY);
+    src.convertTo(src, CV_32F, 1.0 / 255.0);
+
+    cv::Mat dst_gold;
+    cv::filter2D(src, dst_gold, -1, kernel, cv::Point(-1, -1), 0, cv::BORDER_CONSTANT);
+
+    cv::Mat dst;
+
+    cv::gpu::GpuMat dev_dst;
+
+    cv::gpu::filter2D(loadMat(src, useRoi), dev_dst, -1, kernel);
+
+    dev_dst.download(dst);
+
+    EXPECT_MAT_NEAR_KSIZE(dst_gold, dst, ksize, 1e-3);
+}
+
+INSTANTIATE_TEST_CASE_P(Filter, Filter2D, Combine(
+    ALL_DEVICES,
+    Values(3, 5, 7, 11, 13, 15),
+    USE_ROI));
+
 #endif // HAVE_CUDA
diff --git a/modules/gpu/test/test_imgproc.cpp b/modules/gpu/test/test_imgproc.cpp
index f25cb36ae..018a3e62b 100644
--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -2573,36 +2573,36 @@ INSTANTIATE_TEST_CASE_P(ImgProc, EqualizeHist, ALL_DEVICES);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // cornerHarris
 
-PARAM_TEST_CASE(CornerHarris, cv::gpu::DeviceInfo, MatType, Border)
+PARAM_TEST_CASE(CornerHarris, cv::gpu::DeviceInfo, MatType, Border, int, int)
 {
     cv::gpu::DeviceInfo devInfo;
     int type;
     int borderType;
+    int blockSize;
+    int apertureSize;
 
     cv::Mat src;
-    int blockSize;
-    int apertureSize;
     double k;
 
     cv::Mat dst_gold;
-
+
     virtual void SetUp()
     {
         devInfo = GET_PARAM(0);
         type = GET_PARAM(1);
         borderType = GET_PARAM(2);
+        blockSize = GET_PARAM(3);
+        apertureSize = GET_PARAM(4);
 
         cv::gpu::setDevice(devInfo.deviceID());
-
+
         cv::RNG& rng = TS::ptr()->get_rng();
-
+
         cv::Mat img = readImage("stereobm/aloe-L.png", CV_LOAD_IMAGE_GRAYSCALE);
         ASSERT_FALSE(img.empty());
-
+
         img.convertTo(src, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-        blockSize = 1 + rng.next() % 5;
-        apertureSize = 1 + 2 * (rng.next() % 4);
+
         k = rng.uniform(0.1, 0.9);
 
         cv::cornerHarris(src, dst_gold, blockSize, apertureSize, k, borderType);
@@ -2612,7 +2612,7 @@ PARAM_TEST_CASE(CornerHarris, cv::gpu::DeviceInfo, MatType, Border)
 TEST_P(CornerHarris, Accuracy)
 {
     cv::Mat dst;
-
+
     cv::gpu::GpuMat dev_dst;
 
     cv::gpu::cornerHarris(loadMat(src), dev_dst, blockSize, apertureSize, k, borderType);
@@ -2625,21 +2625,23 @@ TEST_P(CornerHarris, Accuracy)
 INSTANTIATE_TEST_CASE_P(ImgProc, CornerHarris, Combine(
     ALL_DEVICES,
     Values(CV_8UC1, CV_32FC1),
-    Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT)));
+    Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT),
+    Values(3, 5, 7),
+    Values(0, 3, 5, 7)));
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // cornerMinEigen
 
-PARAM_TEST_CASE(CornerMinEigen, cv::gpu::DeviceInfo, MatType, Border)
+PARAM_TEST_CASE(CornerMinEigen, cv::gpu::DeviceInfo, MatType, Border, int, int)
 {
     cv::gpu::DeviceInfo devInfo;
     int type;
     int borderType;
-
-    cv::Mat src;
     int blockSize;
     int apertureSize;
 
+    cv::Mat src;
+
     cv::Mat dst_gold;
 
     virtual void SetUp()
@@ -2647,18 +2649,17 @@ PARAM_TEST_CASE(CornerMinEigen, cv::gpu::DeviceInfo, MatType, Border)
         devInfo = GET_PARAM(0);
         type = GET_PARAM(1);
         borderType = GET_PARAM(2);
+        blockSize = GET_PARAM(3);
+        apertureSize = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
 
-        cv::gpu::setDevice(devInfo.deviceID());
-
         cv::RNG& rng = TS::ptr()->get_rng();
-
+
         cv::Mat img = readImage("stereobm/aloe-L.png", CV_LOAD_IMAGE_GRAYSCALE);
         ASSERT_FALSE(img.empty());
 
         img.convertTo(src, type, type == CV_32F ? 1.0 / 255.0 : 1.0);
-
-        blockSize = 1 + rng.next() % 5;
-        apertureSize = 1 + 2 * (rng.next() % 4);
 
         cv::cornerMinEigenVal(src, dst_gold, blockSize, apertureSize, borderType);
     }
@@ -2667,7 +2668,7 @@ PARAM_TEST_CASE(CornerMinEigen, cv::gpu::DeviceInfo, MatType, Border)
 TEST_P(CornerMinEigen, Accuracy)
 {
     cv::Mat dst;
-
+
     cv::gpu::GpuMat dev_dst;
 
     cv::gpu::cornerMinEigenVal(loadMat(src), dev_dst, blockSize, apertureSize, borderType);
@@ -2680,7 +2681,9 @@ TEST_P(CornerMinEigen, Accuracy)
 INSTANTIATE_TEST_CASE_P(ImgProc, CornerMinEigen, Combine(
     ALL_DEVICES,
     Values(CV_8UC1, CV_32FC1),
-    Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT)));
+    Values((int) cv::BORDER_REFLECT101, (int) cv::BORDER_REPLICATE, (int) cv::BORDER_REFLECT),
+    Values(3, 5, 7),
+    Values(0, 3, 5, 7)));
 
 ////////////////////////////////////////////////////////////////////////
 // ColumnSum
@@ -3641,12 +3644,54 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Canny, testing::Combine(
 ////////////////////////////////////////////////////////
 // convolve
 
-PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, int)
+namespace
+{
+    void convolveDFT(const cv::Mat& A, const cv::Mat& B, cv::Mat& C, bool ccorr = false)
+    {
+        // reallocate the output array if needed
+        C.create(std::abs(A.rows - B.rows) + 1, std::abs(A.cols - B.cols) + 1, A.type());
+        Size dftSize;
+
+        // compute the size of DFT transform
+        dftSize.width = cv::getOptimalDFTSize(A.cols + B.cols - 1);
+        dftSize.height = cv::getOptimalDFTSize(A.rows + B.rows - 1);
+
+        // allocate temporary buffers and initialize them with 0's
+        cv::Mat tempA(dftSize, A.type(), cv::Scalar::all(0));
+        cv::Mat tempB(dftSize, B.type(), cv::Scalar::all(0));
+
+        // copy A and B to the top-left corners of tempA and tempB, respectively
+        cv::Mat roiA(tempA, cv::Rect(0, 0, A.cols, A.rows));
+        A.copyTo(roiA);
+        cv::Mat roiB(tempB, cv::Rect(0, 0, B.cols, B.rows));
+        B.copyTo(roiB);
+
+        // now transform the padded A & B in-place;
+        // use "nonzeroRows" hint for faster processing
+        cv::dft(tempA, tempA, 0, A.rows);
+        cv::dft(tempB, tempB, 0, B.rows);
+
+        // multiply the spectrums;
+        // the function handles packed spectrum representations well
+        cv::mulSpectrums(tempA, tempB, tempA, 0, ccorr);
+
+        // transform the product back from the frequency domain.
+        // Even though all the result rows will be non-zero,
+        // you need only the first C.rows of them, and thus you
+        // pass nonzeroRows == C.rows
+        cv::dft(tempA, tempA, cv::DFT_INVERSE + cv::DFT_SCALE, C.rows);
+
+        // now copy the result back to C.
+        tempA(cv::Rect(0, 0, C.cols, C.rows)).copyTo(C);
+    }
+}
+
+PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, int, bool)
 {
     cv::gpu::DeviceInfo devInfo;
     int ksize;
+    bool ccorr;
 
-    cv::Size size;
     cv::Mat src;
     cv::Mat kernel;
 
@@ -3656,36 +3701,38 @@ PARAM_TEST_CASE(Convolve, cv::gpu::DeviceInfo, int)
     {
         devInfo = GET_PARAM(0);
         ksize = GET_PARAM(1);
+        ccorr = GET_PARAM(2);
 
         cv::gpu::setDevice(devInfo.deviceID());
 
         cv::RNG& rng = TS::ptr()->get_rng();
 
-        size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
+        cv::Size size(rng.uniform(200, 400), rng.uniform(200, 400));
 
-        src = randomMat(rng, size, CV_32FC1, 0.0, 255.0, false);
+        src = randomMat(rng, size, CV_32FC1, 0.0, 100.0, false);
         kernel = randomMat(rng, cv::Size(ksize, ksize), CV_32FC1, 0.0, 1.0, false);
 
-        cv::filter2D(src, dst_gold, CV_32F, kernel, cv::Point(-1, -1), 0, cv::BORDER_REPLICATE);
+        convolveDFT(src, kernel, dst_gold, ccorr);
     }
 };
 
 TEST_P(Convolve, Accuracy)
-{
+{
     cv::Mat dst;
 
     cv::gpu::GpuMat d_dst;
 
-    cv::gpu::convolve(loadMat(src), loadMat(kernel), d_dst);
+    cv::gpu::convolve(loadMat(src), loadMat(kernel), d_dst, ccorr);
 
     d_dst.download(dst);
 
-    EXPECT_MAT_NEAR(dst, dst_gold, 1e-2);
+    EXPECT_MAT_NEAR(dst, dst_gold, 1e-1);
 }
 
 INSTANTIATE_TEST_CASE_P(ImgProc, Convolve, Combine(
     ALL_DEVICES,
-    Values(3, 5, 7, 9, 11)));
+    Values(3, 7, 11, 17, 19, 23, 45),
+    Bool()));
 
 #endif // HAVE_CUDA
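Usage sketch (illustrative, not part of the patch): the snippet below shows how the CV_32FC1 filter2D path introduced above might be called from user code, following the gpu::filter2D signature documented in image_filtering.rst and the checks used in test_filters.cpp. The input file name, the 7x7 box kernel, and the tolerance are assumptions for illustration only, and the code presumes an OpenCV 2.4-era build with the gpu module and CUDA available.

// Minimal sketch: run the new CV_32FC1 gpu::filter2D path and compare against the CPU filter2D.
// "lena.png", the kernel size, and the 1e-3 tolerance are placeholder assumptions.
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Load a grayscale image and convert it to CV_32FC1, the type this change enables.
    cv::Mat img = cv::imread("lena.png", CV_LOAD_IMAGE_GRAYSCALE);
    if (img.empty())
        return -1;

    cv::Mat src32f;
    img.convertTo(src32f, CV_32F, 1.0 / 255.0);

    // Normalized 7x7 box kernel; the new path asserts ksize.width * ksize.height <= 16 * 16.
    const int ksize = 7;
    cv::Mat kernel = cv::Mat::ones(ksize, ksize, CV_32FC1) / float(ksize * ksize);

    // Upload, filter on the GPU, then download the result.
    cv::gpu::GpuMat d_src(src32f), d_dst;
    cv::gpu::filter2D(d_src, d_dst, -1, kernel);

    cv::Mat dst_gpu, dst_cpu;
    d_dst.download(dst_gpu);

    // CPU reference with zero (constant) border, mirroring the border handling used in the tests.
    cv::filter2D(src32f, dst_cpu, -1, kernel, cv::Point(-1, -1), 0, cv::BORDER_CONSTANT);

    double err = cv::norm(dst_gpu, dst_cpu, cv::NORM_INF);
    return err < 1e-3 ? 0 : 1;
}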