diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index f3e9a8fad..949acba69 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -284,12 +284,11 @@ CV_EXPORTS Ptr<BaseFilter_GPU> createMorphologyFilter_GPU(int op, int type, co
 
 //! returns 2D filter with the specified kernel
 //! supports CV_8UC1 and CV_8UC4 types
-CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
-                                                   Point anchor = Point(-1, -1));
+CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT);
 
 //! returns the non-separable linear filter engine
 CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
-                                                        const Point& anchor = Point(-1,-1));
+                                                        Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT);
 
 //! returns the primitive row filter with the specified kernel.
 //! supports only CV_8UC1, CV_8UC4, CV_16SC1, CV_16SC2, CV_32SC1, CV_32FC1 source type.
@@ -367,7 +366,7 @@ CV_EXPORTS void morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat&
                              Point anchor = Point(-1, -1), int iterations = 1, Stream& stream = Stream::Null());
 
 //! applies non-separable 2D linear filter to the image
-CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), Stream& stream = Stream::Null());
+CV_EXPORTS void filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
 
 //! applies separable 2D linear filter to the image
 CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
diff --git a/modules/gpu/src/cuda/imgproc.cu b/modules/gpu/src/cuda/imgproc.cu
index 57cec51cb..79c8fa285 100644
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -900,49 +900,100 @@ namespace cv { namespace gpu { namespace device
 
         __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];
 
-        texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeClamp);
-
-        __global__ void filter2D(int ofsX, int ofsY, PtrStepf dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY, const BrdReflect101 brd)
+        template <class SrcT, typename D>
+        __global__ void filter2D(const SrcT src, DevMem2D_<D> dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
         {
+            typedef typename TypeVec<float, VecTraits<D>::cn>::vec_type sum_t;
+
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-            if (x > brd.last_col || y > brd.last_row)
+            if (x >= dst.cols || y >= dst.rows)
                 return;
 
-            float res = 0;
+            sum_t res = VecTraits<sum_t>::all(0);
             int kInd = 0;
 
             for (int i = 0; i < kHeight; ++i)
             {
                 for (int j = 0; j < kWidth; ++j)
-                {
-                    const int srcX = ofsX + brd.idx_col(x - anchorX + j);
-                    const int srcY = ofsY + brd.idx_row(y - anchorY + i);
-
-                    res += tex2D(filter2DTex, srcX, srcY) * c_filter2DKernel[kInd++];
-                }
+                    res = res + src(y - anchorY + i, x - anchorX + j) * c_filter2DKernel[kInd++];
             }
 
-            dst.ptr(y)[x] = res;
+            dst(y, x) = saturate_cast<D>(res);
         }
 
-        void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream)
+        template <typename T, typename D, template <typename> class Brd> struct Filter2DCaller;
+
+        #define IMPLEMENT_FILTER2D_TEX_READER(type) \
+            texture< type , cudaTextureType2D, cudaReadModeElementType> tex_filter2D_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
+            struct tex_filter2D_ ## type ## _reader \
+            { \
+                typedef type elem_type; \
+                typedef int index_type; \
+                const int xoff; \
+                const int yoff; \
+                tex_filter2D_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
+                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
+                { \
+                    return tex2D(tex_filter2D_ ## type , x + xoff, y + yoff); \
+                } \
+            }; \
+            template <typename D, template <typename> class Brd> struct Filter2DCaller< type , D, Brd> \
+            { \
+                static void call(const DevMem2D_< type > srcWhole, int xoff, int yoff, DevMem2D_<D> dst, \
+                    int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream) \
+                { \
+                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
+                    dim3 block(16, 16); \
+                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+                    bindTexture(&tex_filter2D_ ## type , srcWhole); \
+                    tex_filter2D_ ## type ##_reader texSrc(xoff, yoff); \
+                    Brd<work_type> brd(dst.rows, dst.cols, VecTraits<work_type>::make(borderValue)); \
+                    BorderReader< tex_filter2D_ ## type ##_reader, Brd<work_type> > brdSrc(texSrc, brd); \
+                    filter2D<<<grid, block, 0, stream>>>(brdSrc, dst, kWidth, kHeight, anchorX, anchorY); \
+                    cudaSafeCall( cudaGetLastError() ); \
+                    if (stream == 0) \
+                        cudaSafeCall( cudaDeviceSynchronize() ); \
+                } \
+            };
+
+        IMPLEMENT_FILTER2D_TEX_READER(uchar);
+        IMPLEMENT_FILTER2D_TEX_READER(uchar4);
+
+        IMPLEMENT_FILTER2D_TEX_READER(ushort);
+        IMPLEMENT_FILTER2D_TEX_READER(ushort4);
+
+        IMPLEMENT_FILTER2D_TEX_READER(float);
+        IMPLEMENT_FILTER2D_TEX_READER(float4);
+
+        #undef IMPLEMENT_FILTER2D_TEX_READER
+
+        template <typename T, typename D>
+        void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+                          int borderMode, const float* borderValue, cudaStream_t stream)
         {
+            typedef void (*func_t)(const DevMem2D_<T> srcWhole, int xoff, int yoff, DevMem2D_<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
+            static const func_t funcs[] =
+            {
+                Filter2DCaller<T, D, BrdReflect101>::call,
+                Filter2DCaller<T, D, BrdReplicate>::call,
+                Filter2DCaller<T, D, BrdConstant>::call,
+                Filter2DCaller<T, D, BrdReflect>::call,
+                Filter2DCaller<T, D, BrdWrap>::call
+            };
+
             cudaSafeCall(cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
 
-            const dim3 block(16, 16);
-            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            bindTexture(&filter2DTex, src);
-
-            BrdReflect101 brd(dst.rows, dst.cols);
-
-            filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY, brd);
-            cudaSafeCall(cudaGetLastError());
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
+            funcs[borderMode](static_cast< DevMem2D_<T> >(srcWhole), ofsX, ofsY, static_cast< DevMem2D_<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
         }
+
+        template void filter2D_gpu<uchar, uchar>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<uchar4, uchar4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<ushort, ushort>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<ushort4, ushort4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<float, float>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
+        template void filter2D_gpu<float4, float4>(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
     } // namespace imgproc
 }}} // namespace cv { namespace gpu { namespace device {
diff --git a/modules/gpu/src/filtering.cpp b/modules/gpu/src/filtering.cpp
index ac61d20df..2349857ee 100644
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -58,8 +58,8 @@ Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const
 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, GpuMat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
+Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
+Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, Point, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
 Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
 Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
 Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
@@ -78,7 +78,7 @@ void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nog
 void cv::gpu::dilate(const GpuMat&, GpuMat&, const Mat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
 void cv::gpu::morphologyEx(const GpuMat&, GpuMat&, int, const Mat&, GpuMat&, GpuMat&, Point, int, Stream&) { throw_nogpu(); }
-void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, Stream&) { throw_nogpu(); }
+void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point, int, Stream&) { throw_nogpu(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, int, int) { throw_nogpu(); }
 void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, GpuMat&, Point, int, int, Stream&) { throw_nogpu(); }
 void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double, int, int) { throw_nogpu(); }
@@ -663,7 +663,10 @@ namespace cv { namespace gpu { namespace device
 {
     namespace imgproc
     {
-        void filter2D_gpu(DevMem2Df src, int ofsX, int ofsY, DevMem2Df dst, int kWidth, int kHeight, int anchorX, int anchorY, float* kernel, cudaStream_t stream);
+        template <typename T, typename D>
+        void filter2D_gpu(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+                          int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+                          int borderMode, const float* borderValue, cudaStream_t stream);
     }
 }}}
 
@@ -705,10 +708,16 @@ namespace
         nppFilter2D_t func;
     };
 
-    struct GpuLinearFilter : public BaseFilter_GPU
+    typedef void (*gpuFilter2D_t)(DevMem2Db srcWhole, int ofsX, int ofsY, DevMem2Db dst,
+                                  int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel,
+                                  int borderMode, const float* borderValue, cudaStream_t stream);
+
+    struct GpuFilter2D : public BaseFilter_GPU
     {
-        GpuLinearFilter(Size ksize_, Point anchor_, const GpuMat& kernel_) :
-            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_) {}
+        GpuFilter2D(Size ksize_, Point anchor_, gpuFilter2D_t func_, const GpuMat& kernel_, int brd_type_) :
+            BaseFilter_GPU(ksize_, anchor_), func(func_), kernel(kernel_), brd_type(brd_type_)
+        {
+        }
 
         virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
         {
@@ -719,30 +728,32 @@ namespace
             src.locateROI(wholeSize, ofs);
             GpuMat srcWhole(wholeSize, src.type(), src.datastart);
 
-            filter2D_gpu(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), StreamAccessor::getStream(stream));
+            static const Scalar_<float> zero = Scalar_<float>::all(0.0f);
+            func(srcWhole, ofs.x, ofs.y, dst, ksize.width, ksize.height, anchor.x, anchor.y, kernel.ptr<float>(), brd_type, zero.val, StreamAccessor::getStream(stream));
         }
 
+        gpuFilter2D_t func;
         GpuMat kernel;
+        int brd_type;
     };
 }
 
-Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, Point anchor)
+Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int brd_type)
 {
-    CV_Assert(srcType == CV_8UC1 || srcType == CV_8UC4 || srcType == CV_32FC1);
+    using namespace cv::gpu::device::imgproc;
+
+    int sdepth = CV_MAT_DEPTH(srcType);
+    int scn = CV_MAT_CN(srcType);
+
+    CV_Assert(sdepth == CV_8U || sdepth == CV_16U || sdepth == CV_32F);
+    CV_Assert(scn == 1 || scn == 4);
     CV_Assert(dstType == srcType);
+    CV_Assert(brd_type == BORDER_REFLECT101 || brd_type == BORDER_REPLICATE || brd_type == BORDER_CONSTANT || brd_type == BORDER_REFLECT || brd_type == BORDER_WRAP);
 
-    if (srcType == CV_32FC1)
-    {
-        CV_Assert(ksize.width * ksize.height <= 16 * 16);
+    Size ksize = kernel.size();
 
-        GpuMat gpu_krnl;
-        normalizeKernel(kernel, gpu_krnl, CV_32F);
-
-        normalizeAnchor(anchor, ksize);
-
-        return Ptr<BaseFilter_GPU>(new GpuLinearFilter(ksize, anchor, gpu_krnl));
-    }
-    else
+#if 0
+    if ((srcType == CV_8UC1 || srcType == CV_8UC4) && brd_type == BORDER_CONSTANT)
     {
         static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
 
@@ -754,27 +765,64 @@ Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const
 
         return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
     }
+#endif
+
+    CV_Assert(ksize.width * ksize.height <= 16 * 16);
+
+    int gpuBorderType;
+    CV_Assert( tryConvertToGpuBorderType(brd_type, gpuBorderType) );
+
+    GpuMat gpu_krnl;
+    normalizeKernel(kernel, gpu_krnl, CV_32F);
+
+    normalizeAnchor(anchor, ksize);
+
+    gpuFilter2D_t func = 0;
+
+    switch (srcType)
+    {
+    case CV_8UC1:
+        func = filter2D_gpu<uchar, uchar>;
+        break;
+    case CV_8UC4:
+        func = filter2D_gpu<uchar4, uchar4>;
+        break;
+    case CV_16UC1:
+        func = filter2D_gpu<ushort, ushort>;
+        break;
+    case CV_16UC4:
+        func = filter2D_gpu<ushort4, ushort4>;
+        break;
+    case CV_32FC1:
+        func = filter2D_gpu<float, float>;
+        break;
+    case CV_32FC4:
+        func = filter2D_gpu<float4, float4>;
+        break;
+    }
+
+    return Ptr<BaseFilter_GPU>(new GpuFilter2D(ksize, anchor, func, gpu_krnl, gpuBorderType));
 }
 
-Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor)
+Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor, int borderType)
 {
-    Size ksize = kernel.size();
-
-    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor);
+    Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, anchor, borderType);
 
     return createFilter2D_GPU(linearFilter, srcType, dstType);
 }
 
-void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, Stream& stream)
+void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor, int borderType, Stream& stream)
 {
-    if( ddepth < 0 )
+    if (ddepth < 0)
         ddepth = src.depth();
 
-    dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
+    int dst_type = CV_MAKE_TYPE(ddepth, src.channels());
 
-    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst.type(), kernel, anchor);
+    Ptr<FilterEngine_GPU> f = createLinearFilter_GPU(src.type(), dst_type, kernel, anchor, borderType);
 
-    f->apply(src, dst, src.type() == CV_32FC1 ? Rect(0, 0, src.cols, src.rows) : Rect(0, 0, -1, -1), stream);
+    dst.create(src.size(), dst_type);
+
+    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpu/test/test_filters.cpp b/modules/gpu/test/test_filters.cpp
index 29c31565f..6f7b8d911 100644
--- a/modules/gpu/test/test_filters.cpp
+++ b/modules/gpu/test/test_filters.cpp
@@ -505,13 +505,14 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, MorphEx, testing::Combine(
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Filter2D
 
-PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, UseRoi)
+PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor, BorderType, UseRoi)
 {
     cv::gpu::DeviceInfo devInfo;
     cv::Size size;
     int type;
     cv::Size ksize;
     cv::Point anchor;
+    int borderType;
     bool useRoi;
 
     cv::Mat img;
@@ -524,7 +525,8 @@ PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor,
         type = GET_PARAM(2);
         ksize = GET_PARAM(3);
         anchor = GET_PARAM(4);
-        useRoi = GET_PARAM(5);
+        borderType = GET_PARAM(5);
+        useRoi = GET_PARAM(6);
 
         cv::gpu::setDevice(devInfo.deviceID());
     }
@@ -533,26 +535,24 @@ PARAM_TEST_CASE(Filter2D, cv::gpu::DeviceInfo, cv::Size, MatType, KSize, Anchor,
 TEST_P(Filter2D, Accuracy)
 {
     cv::Mat src = randomMat(size, type);
-    cv::Mat kernel = cv::Mat::ones(ksize.height, ksize.width, CV_32FC1);
+    cv::Mat kernel = randomMat(cv::Size(ksize.width, ksize.height), CV_32FC1, 0.0, 1.0);
 
     cv::gpu::GpuMat dst = createMat(size, type, useRoi);
-    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor);
+    cv::gpu::filter2D(loadMat(src, useRoi), dst, -1, kernel, anchor, borderType);
 
     cv::Mat dst_gold;
-    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, type == CV_32FC1 ? cv::BORDER_DEFAULT : cv::BORDER_CONSTANT);
+    cv::filter2D(src, dst_gold, -1, kernel, anchor, 0, borderType);
 
-    if (type == CV_32FC1)
-        EXPECT_MAT_NEAR(dst_gold, dst, 1e-1);
-    else
-        EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), 0.0);
+    EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
 }
 
 INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
-    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1)),
+    testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
     testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
     testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
+    testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
 } // namespace
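
For reference, a minimal usage sketch of the API after this patch (illustrative only, not part of the diff; the input path, kernel, and chosen border mode are placeholder assumptions): gpu::filter2D now takes an explicit borderType argument instead of hard-coding BORDER_REFLECT101 for CV_32FC1 and BORDER_CONSTANT for 8-bit inputs.

// filter2d_border_demo.cpp -- illustrative sketch, not part of the patch
#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    // Load a single-channel image; after this patch the GPU filter supports
    // 1- or 4-channel 8U/16U/32F inputs.
    cv::Mat src = cv::imread("input.png", 0);   // placeholder path, grayscale load
    if (src.empty())
        return -1;

    // 5x5 normalized box kernel (placeholder kernel values).
    cv::Mat kernel = cv::Mat::ones(5, 5, CV_32FC1) / 25.0;

    cv::gpu::GpuMat d_src(src), d_dst;

    // New in this patch: the border handling mode is passed explicitly.
    cv::gpu::filter2D(d_src, d_dst, -1, kernel, cv::Point(-1, -1), cv::BORDER_REPLICATE);

    cv::Mat dst;
    d_dst.download(dst);
    return 0;
}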