Merge pull request #679 from jet47:gpu-simd-functions

2013-03-21 14:59:55 +04:00
parent 1b10699996 33ff3d6016
commit d756de176f
3 changed files with 1144 additions and 823 deletions
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -48,6 +48,7 @@
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/simd_functions.hpp"

 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -154,170 +155,28 @@ namespace arithm

 namespace arithm
 {
-    template <typename T, typename D> struct VAdd4;
-    template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd4(a, b);
        }

        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4<uint, uint>& other) {}
-    };
-    template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4<int, uint>& other) {}
-    };
-    template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4<uint, int>& other) {}
-    };
-    template <> struct VAdd4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd4() {}
-        __device__ __forceinline__ VAdd4(const VAdd4<int, int>& other) {}
+        __device__ __forceinline__ VAdd4(const VAdd4& other) {}
    };

    ////////////////////////////////////

-    template <typename T, typename D> struct VAdd2;
-    template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
+    struct VAdd2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vadd2(a, b);
        }

        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2<uint, uint>& other) {}
-    };
-    template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2<uint, int>& other) {}
-    };
-    template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2<int, uint>& other) {}
-    };
-    template <> struct VAdd2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAdd2() {}
-        __device__ __forceinline__ VAdd2(const VAdd2<int, int>& other) {}
+        __device__ __forceinline__ VAdd2(const VAdd2& other) {}
    };

    ////////////////////////////////////
@@ -336,13 +195,13 @@ namespace arithm

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAdd4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAdd2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAdd2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

@@ -355,28 +214,16 @@ namespace cv { namespace gpu { namespace device

 namespace arithm
 {
-    template <typename T, typename D>
-    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd4(), WithOutMask(), stream);
    }

-    template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void addMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAdd2(), WithOutMask(), stream);
    }

-    template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
    template <typename T, typename D>
    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
    {
@@ -543,170 +390,28 @@ namespace arithm

 namespace arithm
 {
-    template <typename T, typename D> struct VSub4;
-    template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub4(a, b);
        }

        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4<uint, uint>& other) {}
-    };
-    template <> struct VSub4<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4<int, uint>& other) {}
-    };
-    template <> struct VSub4<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4<uint, int>& other) {}
-    };
-    template <> struct VSub4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub4() {}
-        __device__ __forceinline__ VSub4(const VSub4<int, int>& other) {}
+        __device__ __forceinline__ VSub4(const VSub4& other) {}
    };

    ////////////////////////////////////

-    template <typename T, typename D> struct VSub2;
-    template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint>
+    struct VSub2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vsub2(a, b);
        }

        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2<uint, uint>& other) {}
-    };
-    template <> struct VSub2<uint, int> : binary_function<uint, uint, int>
-    {
-        __device__ __forceinline__ int operator ()(uint a, uint b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2<uint, int>& other) {}
-    };
-    template <> struct VSub2<int, uint> : binary_function<int, int, uint>
-    {
-        __device__ __forceinline__ uint operator ()(int a, int b) const
-        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2<int, uint>& other) {}
-    };
-    template <> struct VSub2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VSub2() {}
-        __device__ __forceinline__ VSub2(const VSub2<int, int>& other) {}
+        __device__ __forceinline__ VSub2(const VSub2& other) {}
    };

    ////////////////////////////////////
@@ -725,13 +430,13 @@ namespace arithm

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VSub4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VSub2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VSub2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

@@ -744,28 +449,16 @@ namespace cv { namespace gpu { namespace device

 namespace arithm
 {
-    template <typename T, typename D>
-    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub4(), WithOutMask(), stream);
    }

-    template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void subMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VSub2(), WithOutMask(), stream);
    }

-    template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
    template <typename T, typename D>
    void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
    {
@@ -1496,90 +1189,28 @@ namespace arithm

 namespace arithm
 {
-    template <typename T, typename D> struct VAbsDiff4;
-    template <> struct VAbsDiff4<uint, uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff4(a, b);
        }

        __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<uint, uint>& other) {}
-    };
-    template <> struct VAbsDiff4<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff4() {}
-        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<int, int>& other) {}
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
    };

    ////////////////////////////////////

-    template <typename T, typename D> struct VAbsDiff2;
-    template <> struct VAbsDiff2<uint, uint> : binary_function<uint, uint, uint>
+    struct VAbsDiff2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vabsdiff2(a, b);
        }

        __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<uint, uint>& other) {}
-    };
-    template <> struct VAbsDiff2<int, int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VAbsDiff2() {}
-        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<int, int>& other) {}
+        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
    };

    ////////////////////////////////////
@@ -1611,13 +1242,13 @@ namespace arithm

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff4<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

-    template <typename T, typename D> struct TransformFunctorTraits< arithm::VAbsDiff2<T, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
+    template <> struct TransformFunctorTraits< arithm::VAbsDiff2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

@@ -1630,24 +1261,16 @@ namespace cv { namespace gpu { namespace device

 namespace arithm
 {
-    template <typename T>
-    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T, T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff4(), WithOutMask(), stream);
    }

-    template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T>
-    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void absDiffMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T, T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VAbsDiff2(), WithOutMask(), stream);
    }

-    template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
    template <typename T>
    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
    {
@@ -1877,6 +1500,49 @@ namespace arithm

 namespace arithm
 {
+    struct VCmpEq4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpeq4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpEq4() {}
+        __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+    };
+    struct VCmpNe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmpne4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpNe4() {}
+        __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+    };
+    struct VCmpLt4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmplt4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLt4() {}
+        __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+    };
+    struct VCmpLe4 : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            return vcmple4(a, b);
+        }
+
+        __device__ __forceinline__ VCmpLe4() {}
+        __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+    };
+
+    ////////////////////////////////////
+
    template <class Op, typename T>
    struct Cmp : binary_function<T, T, uchar>
    {
@@ -1890,6 +1556,21 @@ namespace arithm

 namespace cv { namespace gpu { namespace device
 {
+    template <> struct TransformFunctorTraits< arithm::VCmpEq4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpNe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLt4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+    template <> struct TransformFunctorTraits< arithm::VCmpLe4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
+    {
+    };
+
+    ////////////////////////////////////
+
    template <class Op, typename T> struct TransformFunctorTraits< arithm::Cmp<Op, T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
    {
    };
@@ -1897,6 +1578,23 @@ namespace cv { namespace gpu { namespace device

 namespace arithm
 {
+    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpEq4(), WithOutMask(), stream);
+    }
+    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpNe4(), WithOutMask(), stream);
+    }
+    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLt4(), WithOutMask(), stream);
+    }
+    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
+    {
+        transform(src1, src2, dst, VCmpLe4(), WithOutMask(), stream);
+    }
+
    template <template <typename> class Op, typename T>
    void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
    {
@@ -2303,44 +2001,11 @@ namespace arithm

 namespace arithm
 {
-    template <typename T> struct VMin4;
-    template <> struct VMin4<uint> : binary_function<uint, uint, uint>
+    struct VMin4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VMin4() {}
-        __device__ __forceinline__ VMin4(const VMin4& other) {}
-    };
-    template <> struct VMin4<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmin.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vmin4(a, b);
        }

        __device__ __forceinline__ VMin4() {}
@@ -2349,40 +2014,11 @@ namespace arithm

    ////////////////////////////////////

-    template <typename T> struct VMin2;
-    template <> struct VMin2<uint> : binary_function<uint, uint, uint>
+    struct VMin2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VMin2() {}
-        __device__ __forceinline__ VMin2(const VMin2& other) {}
-    };
-    template <> struct VMin2<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmin2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmin.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmin.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vmin2(a, b);
        }

        __device__ __forceinline__ VMin2() {}
@@ -2392,13 +2028,13 @@ namespace arithm

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> struct TransformFunctorTraits< arithm::VMin4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VMin4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

-    template <typename T> struct TransformFunctorTraits< arithm::VMin2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VMin2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

@@ -2415,14 +2051,14 @@ namespace cv { namespace gpu { namespace device

 namespace arithm
 {
-    template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void minMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin4<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VMin4(), WithOutMask(), stream);
    }

-    template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void minMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin2<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VMin2(), WithOutMask(), stream);
    }

    template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2430,12 +2066,6 @@ namespace arithm
        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
    }

-    template void vmin4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vmin4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template void vmin2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vmin2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
    template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -2463,44 +2093,11 @@ namespace arithm

 namespace arithm
 {
-    template <typename T> struct VMax4;
-    template <> struct VMax4<uint> : binary_function<uint, uint, uint>
+    struct VMax4 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VMax4() {}
-        __device__ __forceinline__ VMax4(const VMax4& other) {}
-    };
-    template <> struct VMax4<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmax.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vmax4(a, b);
        }

        __device__ __forceinline__ VMax4() {}
@@ -2509,40 +2106,11 @@ namespace arithm

    ////////////////////////////////////

-    template <typename T> struct VMax2;
-    template <> struct VMax2<uint> : binary_function<uint, uint, uint>
+    struct VMax2 : binary_function<uint, uint, uint>
    {
        __device__ __forceinline__ uint operator ()(uint a, uint b) const
        {
-            uint res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
-        }
-
-        __device__ __forceinline__ VMax2() {}
-        __device__ __forceinline__ VMax2(const VMax2& other) {}
-    };
-    template <> struct VMax2<int> : binary_function<int, int, int>
-    {
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            int res = 0;
-
-        #if __CUDA_ARCH__ >= 300
-            asm("vmax2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #elif __CUDA_ARCH__ >= 200
-            asm("vmax.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-            asm("vmax.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
-        #endif
-
-            return res;
+            return vmax2(a, b);
        }

        __device__ __forceinline__ VMax2() {}
@@ -2552,13 +2120,13 @@ namespace arithm

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> struct TransformFunctorTraits< arithm::VMax4<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VMax4 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

    ////////////////////////////////////

-    template <typename T> struct TransformFunctorTraits< arithm::VMax2<T> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(T)>
+    template <> struct TransformFunctorTraits< arithm::VMax2 > : arithm::ArithmFuncTraits<sizeof(uint), sizeof(uint)>
    {
    };

@@ -2575,14 +2143,14 @@ namespace cv { namespace gpu { namespace device

 namespace arithm
 {
-    template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void maxMat_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax4<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VMax4(), WithOutMask(), stream);
    }

-    template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    void maxMat_v2(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream)
    {
-        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax2<T>(), WithOutMask(), stream);
+        transform(src1, src2, dst, VMax2(), WithOutMask(), stream);
    }

    template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
@@ -2590,12 +2158,6 @@ namespace arithm
        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
    }

-    template void vmax4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vmax4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template void vmax2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void vmax2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
    template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -263,11 +263,8 @@ namespace

 namespace arithm
 {
-    template <typename T, typename D>
-    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    void addMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void addMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);

    template <typename T, typename D>
    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -345,62 +342,6 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
        }
    };

-    typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const vfunc_t vfuncs4[4][4] =
-    {
-        {
-            vadd4<unsigned int, unsigned int>,
-            vadd4<unsigned int, int>,
-            0,
-            0
-        },
-        {
-            vadd4<int, unsigned int>,
-            vadd4<int, int>,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            0,
-            0
-        }
-    };
-    static const vfunc_t vfuncs2[4][4] =
-    {
-        {
-            0,
-            0,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            vadd2<unsigned int, unsigned int>,
-            vadd2<unsigned int, int>
-        },
-        {
-            0,
-            0,
-            vadd2<int, unsigned int>,
-            vadd2<int, int>
-        }
-    };
-
    if (dtype < 0)
        dtype = src1.depth();

@@ -426,7 +367,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

-    if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
+    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -434,31 +375,27 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

-        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        if (isAllAligned)
        {
-            const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
-            const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
-
-            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

-                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                addMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }
-
-            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

-                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                addMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }
@@ -606,11 +543,8 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat

 namespace arithm
 {
-    template <typename T, typename D>
-    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    void subMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void subMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);

    template <typename T, typename D>
    void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@@ -688,62 +622,6 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
        }
    };

-    typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const vfunc_t vfuncs4[4][4] =
-    {
-        {
-            vsub4<unsigned int, unsigned int>,
-            vsub4<unsigned int, int>,
-            0,
-            0
-        },
-        {
-            vsub4<int, unsigned int>,
-            vsub4<int, int>,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            0,
-            0
-        }
-    };
-    static const vfunc_t vfuncs2[4][4] =
-    {
-        {
-            0,
-            0,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            0,
-            0
-        },
-        {
-            0,
-            0,
-            vsub2<unsigned int, unsigned int>,
-            vsub2<unsigned int, int>
-        },
-        {
-            0,
-            0,
-            vsub2<int, unsigned int>,
-            vsub2<int, int>
-        }
-    };
-
    if (dtype < 0)
        dtype = src1.depth();

@@ -769,7 +647,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

-    if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S)
+    if (mask.empty() && (sdepth == CV_8U || sdepth == CV_16U) && ddepth == sdepth)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -777,31 +655,27 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

-        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        if (isAllAligned)
        {
-            const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
-            const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
-
-            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            if (sdepth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

-                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                subMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }
-
-            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            else if (sdepth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

-                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                subMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }
@@ -1585,11 +1459,8 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St

 namespace arithm
 {
-    template <typename T>
-    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T>
-    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    void absDiffMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void absDiffMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);

    template <typename T>
    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1610,20 +1481,6 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
        absDiffMat<float>,
        absDiffMat<double>
    };
-    static const func_t vfuncs4[] =
-    {
-        vabsDiff4<unsigned int>,
-        vabsDiff4<int>,
-        0,
-        0
-    };
-    static const func_t vfuncs2[] =
-    {
-        0,
-        0,
-        vabsDiff2<unsigned int>,
-        vabsDiff2<int>
-    };

    const int depth = src1.depth();
    const int cn = src1.channels();
@@ -1645,7 +1502,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

-    if (depth < CV_32S)
+    if (depth == CV_8U || depth == CV_16U)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -1653,31 +1510,27 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

-        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        if (isAllAligned)
        {
-            const func_t vfunc4 = vfuncs4[depth];
-            const func_t vfunc2 = vfuncs2[depth];
-
-            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            if (depth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

-                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                absDiffMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                              stream);

                return;
            }
-
-            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            else if (depth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

-                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                absDiffMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                              PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                              stream);

                return;
            }
@@ -1940,6 +1793,11 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)

 namespace arithm
 {
+    void cmpMatEq_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    void cmpMatNe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    void cmpMatLt_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    void cmpMatLe_v4(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+
    template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
@@ -1962,6 +1820,12 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
        {cmpMatEq<double>        , cmpMatNe<double>        , cmpMatLt<double>        , cmpMatLe<double>        }
    };

+    typedef void (*func_v4_t)(PtrStepSz<uint> src1, PtrStepSz<uint> src2, PtrStepSz<uint> dst, cudaStream_t stream);
+    static const func_v4_t funcs_v4[] =
+    {
+        cmpMatEq_v4, cmpMatNe_v4, cmpMatLt_v4, cmpMatLe_v4
+    };
+
    const int depth = src1.depth();
    const int cn = src1.channels();

@@ -1997,6 +1861,27 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
    PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

+    if (depth == CV_8U && (src1_.cols & 3) == 0)
+    {
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+
+        if (isAllAligned)
+        {
+            const int vcols = src1_.cols >> 2;
+
+            funcs_v4[code](PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                           PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                           PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                           stream);
+
+            return;
+        }
+    }
+
    const func_t func = funcs[depth][code];

    func(src1_, src2_, dst_, stream);
@@ -2532,13 +2417,13 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st

 namespace arithm
 {
-    template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    void minMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void minMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);

-    template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    void maxMat_v4(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
+    void maxMat_v2(PtrStepSz<unsigned int> src1, PtrStepSz<unsigned int> src2, PtrStepSz<unsigned int> dst, cudaStream_t stream);
    template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
    template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 }
@@ -2558,20 +2443,6 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
        minMat<float>,
        minMat<double>
    };
-    static const func_t vfuncs4[] =
-    {
-        vmin4<unsigned int>,
-        vmin4<int>,
-        0,
-        0
-    };
-    static const func_t vfuncs2[] =
-    {
-        0,
-        0,
-        vmin2<unsigned int>,
-        vmin2<int>
-    };

    const int depth = src1.depth();
    const int cn = src1.channels();
@@ -2593,7 +2464,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

-    if (depth < CV_32S)
+    if (depth == CV_8U || depth == CV_16U)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2601,31 +2472,27 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

-        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        if (isAllAligned)
        {
-            const func_t vfunc4 = vfuncs4[depth];
-            const func_t vfunc2 = vfuncs2[depth];
-
-            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            if (depth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

-                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                minMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }
-
-            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            else if (depth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

-                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                minMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }
@@ -2655,20 +2522,6 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
        maxMat<float>,
        maxMat<double>
    };
-    static const func_t vfuncs4[] =
-    {
-        vmax4<unsigned int>,
-        vmax4<int>,
-        0,
-        0
-    };
-    static const func_t vfuncs2[] =
-    {
-        0,
-        0,
-        vmax2<unsigned int>,
-        vmax2<int>
-    };

    const int depth = src1.depth();
    const int cn = src1.channels();
@@ -2690,7 +2543,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s
    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);

-    if (depth < CV_32S)
+    if (depth == CV_8U || depth == CV_16U)
    {
        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
@@ -2698,31 +2551,27 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s

        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;

-        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        if (isAllAligned)
        {
-            const func_t vfunc4 = vfuncs4[depth];
-            const func_t vfunc2 = vfuncs2[depth];
-
-            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            if (depth == CV_8U && (src1_.cols & 3) == 0)
            {
                const int vcols = src1_.cols >> 2;

-                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                maxMat_v4(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }
-
-            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            else if (depth == CV_16U && (src1_.cols & 1) == 0)
            {
                const int vcols = src1_.cols >> 1;

-                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
-                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
-                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
-                       stream);
+                maxMat_v2(PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src1_.data, src1_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) src2_.data, src2_.step),
+                          PtrStepSz<unsigned int>(src1_.rows, vcols, (unsigned int*) dst_.data, dst_.step),
+                          stream);

                return;
            }