diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp
index 6e0471e9a..cd63c3ac9 100644
--- a/modules/gpu/include/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp
@@ -357,6 +357,9 @@ namespace cv { namespace gpu { namespace device
         {
             return abs(x);
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
     {
@@ -364,6 +367,9 @@ namespace cv { namespace gpu { namespace device
         {
             return x;
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<signed char> : unary_function<signed char, signed char>
     {
@@ -371,6 +377,9 @@ namespace cv { namespace gpu { namespace device
         {
             return ::abs(x);
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<char> : unary_function<char, char>
     {
@@ -378,6 +387,9 @@ namespace cv { namespace gpu { namespace device
         {
             return ::abs(x);
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
     {
@@ -385,6 +397,9 @@ namespace cv { namespace gpu { namespace device
         {
             return x;
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<short> : unary_function<short, short>
     {
@@ -392,6 +407,9 @@ namespace cv { namespace gpu { namespace device
         {
             return ::abs(x);
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
     {
@@ -399,6 +417,9 @@ namespace cv { namespace gpu { namespace device
         {
             return x;
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<int> : unary_function<int, int>
     {
@@ -406,6 +427,9 @@ namespace cv { namespace gpu { namespace device
         {
             return ::abs(x);
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<float> : unary_function<float, float>
     {
@@ -413,6 +437,9 @@ namespace cv { namespace gpu { namespace device
         {
             return ::fabsf(x);
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
     template <> struct abs_func<double> : unary_function<double, double>
     {
@@ -420,6 +447,9 @@ namespace cv { namespace gpu { namespace device
         {
             return ::fabs(x);
         }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
     };
 
 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
@@ -429,6 +459,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func ## f(v); \
         } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     }; \
     template <> struct name ## _func<double> : unary_function<double, double> \
     { \
@@ -436,6 +468,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func(v); \
         } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     };
 
 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index c61601d4f..eaf577bac 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -42,405 +42,833 @@
 
 #if !defined CUDA_DISABLER
 
-#include "internal_shared.hpp"
+#include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/transform.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 
-namespace cv { namespace gpu { namespace device
-{
-    //////////////////////////////////////////////////////////////////////////
-    // add
+using namespace cv::gpu;
+using namespace cv::gpu::device;
 
-    template <typename T, typename D> struct Add : binary_function<T, T, D>
+//////////////////////////////////////////////////////////////////////////
+// addMat
+
+namespace
+{
+    template <typename T, typename D> struct VAdd4;
+    template <> struct VAdd4<uint, uint> : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            uint res = 0;
+            // Saturating (.sat) add over byte lanes; PTX suffix order is dtype.atype.btype (u32 result, u32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.b0-.b3), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd4() {}
+        __device__ __forceinline__ VAdd4(const VAdd4<uint, uint>& other) {}
+    };
+    template <> struct VAdd4<int, uint> : binary_function<int, int, uint>
+    {
+        __device__ __forceinline__ uint operator ()(int a, int b) const
+        {
+            uint res = 0;
+            // Saturating (.sat) add over byte lanes; PTX suffix order is dtype.atype.btype (u32 result, s32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.b0-.b3), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd4() {}
+        __device__ __forceinline__ VAdd4(const VAdd4<int, uint>& other) {}
+    };
+    template <> struct VAdd4<uint, int> : binary_function<uint, uint, int>
+    {
+        __device__ __forceinline__ int operator ()(uint a, uint b) const
+        {
+            int res = 0;
+            // Saturating (.sat) add over byte lanes; PTX suffix order is dtype.atype.btype (s32 result, u32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.b0-.b3), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd4() {}
+        __device__ __forceinline__ VAdd4(const VAdd4<uint, int>& other) {}
+    };
+    template <> struct VAdd4<int, int> : binary_function<int, int, int>
+    {
+        __device__ __forceinline__ int operator ()(int a, int b) const
+        {
+            int res = 0;
+            // Saturating (.sat) add over byte lanes; PTX suffix order is dtype.atype.btype (s32 result, s32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.b0-.b3), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd4() {}
+        __device__ __forceinline__ VAdd4(const VAdd4<int, int>& other) {}
+    };
+
+    ////////////////////////////////////
+
+    template <typename T, typename D> struct VAdd2;
+    template <> struct VAdd2<uint, uint> : binary_function<uint, uint, uint>
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            uint res = 0;
+            // Saturating (.sat) add over half-word lanes; PTX suffix order is dtype.atype.btype (u32 result, u32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.h0, .h1), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd2() {}
+        __device__ __forceinline__ VAdd2(const VAdd2<uint, uint>& other) {}
+    };
+    template <> struct VAdd2<uint, int> : binary_function<uint, uint, int>
+    {
+        __device__ __forceinline__ int operator ()(uint a, uint b) const
+        {
+            int res = 0;
+            // Saturating (.sat) add over half-word lanes; PTX suffix order is dtype.atype.btype (s32 result, u32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.h0, .h1), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd2() {}
+        __device__ __forceinline__ VAdd2(const VAdd2<uint, int>& other) {}
+    };
+    template <> struct VAdd2<int, uint> : binary_function<int, int, uint>
+    {
+        __device__ __forceinline__ uint operator ()(int a, int b) const
+        {
+            uint res = 0;
+            // Saturating (.sat) add over half-word lanes; PTX suffix order is dtype.atype.btype (u32 result, s32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.h0, .h1), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd2() {}
+        __device__ __forceinline__ VAdd2(const VAdd2<int, uint>& other) {}
+    };
+    template <> struct VAdd2<int, int> : binary_function<int, int, int>
+    {
+        __device__ __forceinline__ int operator ()(int a, int b) const
+        {
+            int res = 0;
+            // Saturating (.sat) add over half-word lanes; PTX suffix order is dtype.atype.btype (s32 result, s32 operands).
+        #if __CUDA_ARCH__ >= 300
+            asm("vadd2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vadd.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vadd.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+            // Fallback above: one vadd per lane (.h0, .h1), previous res passed as %3 (PTX data merge). No branch compiled -> res stays 0.
+            return res;
+        }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ VAdd2() {}
+        __device__ __forceinline__ VAdd2(const VAdd2<int, int>& other) {}
+    };
+
+    ////////////////////////////////////
+    // AddMat: binary functor over two T operands; operator() returns saturate_cast<D>(a + b).
+    template <typename T, typename D> struct AddMat : binary_function<T, T, D>
     {
         __device__ __forceinline__ D operator ()(T a, T b) const
         {
             return saturate_cast<D>(a + b);
         }
+        // The functor has no data members; default and copy constructors are explicit no-ops.
+        __device__ __forceinline__ AddMat() {}
+        __device__ __forceinline__ AddMat(const AddMat& other) {}
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <typename T, typename D> struct TransformFunctorTraits< VAdd4<T, D> > : DefaultTransformFunctorTraits< VAdd4<T, D> >
+    {
+        enum { smart_shift = 2 }; // applies to every VAdd4<T, D>; NOTE(review): meaning of smart_shift is defined by transform.hpp, not visible here - confirm
     };
 
-    template <> struct TransformFunctorTraits< Add<ushort, ushort> > : DefaultTransformFunctorTraits< Add<ushort, ushort> >
+    ////////////////////////////////////
+    // Traits for every VAdd2<T, D>: base is the default traits of VAdd2 itself (was VAdd4 by copy-paste); sets smart_shift = 2.
+    template <typename T, typename D> struct TransformFunctorTraits< VAdd2<T, D> > : DefaultTransformFunctorTraits< VAdd2<T, D> >
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Add<short, short> > : DefaultTransformFunctorTraits< Add<short, short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Add<int, int> > : DefaultTransformFunctorTraits< Add<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Add<float, float> > : DefaultTransformFunctorTraits< Add<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        enum { smart_shift = 2 };
     };
 
-    template <typename T, typename D> void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
+    ////////////////////////////////////
+    // Full specializations for AddMat<T, T> with T in {ushort, short, int, float}: each sets smart_block_dim_y = 8 and smart_shift = 4.
+    template <> struct TransformFunctorTraits< AddMat<ushort, ushort> > : DefaultTransformFunctorTraits< AddMat<ushort, ushort> >
     {
-        if (mask.data)
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), SingleMask(mask), stream);
-        else
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Add<T, D>(), WithOutMask(), stream);
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< AddMat<short, short> > : DefaultTransformFunctorTraits< AddMat<short, short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< AddMat<int, int> > : DefaultTransformFunctorTraits< AddMat<int, int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< AddMat<float, float> > : DefaultTransformFunctorTraits< AddMat<float, float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T, typename D> // vadd4: applies VAdd4<T, D> to src1/src2 into dst via transform(), without a mask, on `stream`
+    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd4<T, D>(), WithOutMask(), stream); // sources cast to PtrStepSz<T>, dst to PtrStepSz<D>
     }
 
-    template void add_gpu<uchar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<uchar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template void vadd4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vadd4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vadd4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vadd4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void add_gpu<schar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void add_gpu<ushort, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<ushort, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<ushort, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void add_gpu<short, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<short, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void add_gpu<int, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<int, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<int, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<int, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<int, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void add_gpu<float, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<float, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void add_gpu<double, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    template <typename T, typename D> struct AddScalar : unary_function<T, D>
+    template <typename T, typename D> // vadd2: applies VAdd2<T, D> to src1/src2 into dst via transform(), without a mask, on `stream`
+    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        AddScalar(double val_) : val(val_) {}
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VAdd2<T, D>(), WithOutMask(), stream); // sources cast to PtrStepSz<T>, dst to PtrStepSz<D>
+    }
+
+    template void vadd2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vadd2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vadd2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vadd2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template <typename T, typename D> // addMat: applies AddMat<T, D> to src1/src2 into dst via transform() on `stream`
+    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    {
+        if (mask.data) // non-null mask data: pass the mask itself to transform()
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), mask, stream);
+        else // no mask data: use the WithOutMask policy instead
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, AddMat<T, D>(), WithOutMask(), stream);
+    }
+
+    template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void addMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// addScalar
+
+namespace
+{
+    template <typename T, typename S, typename D> struct AddScalar : unary_function<T, D>
     {
+        S val; // scalar addend, stored as type S
+        // Explicit constructor from the addend; it carries no __device__ qualifier, unlike operator() below.
+        explicit AddScalar(S val_) : val(val_) {}
+        // operator(): returns saturate_cast<D>(a + val).
         __device__ __forceinline__ D operator ()(T a) const
         {
             return saturate_cast<D>(a + val);
         }
-        const double val;
     };
+}
 
-    template <> struct TransformFunctorTraits< AddScalar<ushort, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, ushort>  >
+namespace cv { namespace gpu { namespace device
+{ // Traits for AddScalar<T, float, T> with T in {ushort, short, int, float}: each sets smart_block_dim_y = 8 and smart_shift = 4.
+    template <> struct TransformFunctorTraits< AddScalar<ushort, float, ushort> > : DefaultTransformFunctorTraits< AddScalar<ushort, float, ushort>  >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< AddScalar<short, short> > : DefaultTransformFunctorTraits< AddScalar<short, short> >
+    template <> struct TransformFunctorTraits< AddScalar<short, float, short> > : DefaultTransformFunctorTraits< AddScalar<short, float, short> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< AddScalar<int, int> > : DefaultTransformFunctorTraits< AddScalar<int, int> >
+    template <> struct TransformFunctorTraits< AddScalar<int, float, int> > : DefaultTransformFunctorTraits< AddScalar<int, float, int> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< AddScalar<float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float> >
+    template <> struct TransformFunctorTraits< AddScalar<float, float, float> > : DefaultTransformFunctorTraits< AddScalar<float, float, float> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
+}}}
 
-    template <typename T, typename D> void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
+namespace arithm
+{
+    template <typename T, typename S, typename D> // T: source element type, S: type the scalar is converted to, D: destination element type.
+    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) // Runs AddScalar<T, S, D> over src1 into dst through transform(), passing `stream` along.
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );
-        AddScalar<T, D> op(val);
+        AddScalar<T, S, D> op(static_cast<S>(val)); // The double scalar is converted to S once, before transform() is called.
+
         if (mask.data)
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, SingleMask(mask), stream);
+            transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream); // Non-null mask: forwarded unchanged; how it gates elements is decided inside transform().
         else
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+            transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // Null mask data: use the WithOutMask tag instead.
     }
 
-    template void add_gpu<uchar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<uchar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // Explicit instantiations <T, S, D> follow; commented-out lines are combinations that are not built.
+    template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // Throughout this list a double destination uses S = double; every other active line uses S = float.
 
-    //template void add_gpu<schar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template void addScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void add_gpu<ushort, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<ushort, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<ushort, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void addScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void add_gpu<short, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<short, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void addScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void add_gpu<int, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<int, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void addScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // NOTE(review): with S = float, `a + val` is evaluated in float, which cannot represent every int with |a| > 2^24 exactly - confirm this rounding is acceptable for int -> int.
+    template void addScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void add_gpu<float, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void addScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void add_gpu<double, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void add_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void add_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
 
-    //////////////////////////////////////////////////////////////////////////
-    // subtract
+//////////////////////////////////////////////////////////////////////////
+// subMat
 
-    template <typename T, typename D> struct Subtract : binary_function<T, T, D>
+namespace
+{
+    template <typename T, typename D> struct VSub4; // Primary template is only declared; the explicit specializations supply the bodies.
+    template <> struct VSub4<uint, uint> : binary_function<uint, uint, uint> // Unsigned operands, unsigned result: a - b through the PTX vsub4/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            uint res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub4 instruction covers the whole 32-bit word.
+            asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per byte selector (.b0-.b3); each result is fed back in as the last operand of the next.
+            asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub4() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub4(const VSub4<uint, uint>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+    template <> struct VSub4<int, uint> : binary_function<int, int, uint> // Signed operands, unsigned result: a - b through the PTX vsub4/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ uint operator ()(int a, int b) const
+        {
+            uint res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub4 instruction covers the whole 32-bit word.
+            asm("vsub4.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per byte selector (.b0-.b3); each result is fed back in as the last operand of the next.
+            asm("vsub.u32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub4() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub4(const VSub4<int, uint>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+    template <> struct VSub4<uint, int> : binary_function<uint, uint, int> // Unsigned operands, signed result: a - b through the PTX vsub4/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ int operator ()(uint a, uint b) const
+        {
+            int res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub4 instruction covers the whole 32-bit word.
+            asm("vsub4.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per byte selector (.b0-.b3); each result is fed back in as the last operand of the next.
+            asm("vsub.s32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub4() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub4(const VSub4<uint, int>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+    template <> struct VSub4<int, int> : binary_function<int, int, int> // Signed operands, signed result: a - b through the PTX vsub4/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ int operator ()(int a, int b) const
+        {
+            int res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub4 instruction covers the whole 32-bit word.
+            asm("vsub4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per byte selector (.b0-.b3); each result is fed back in as the last operand of the next.
+            asm("vsub.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub4() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub4(const VSub4<int, int>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+
+    ////////////////////////////////////
+
+    template <typename T, typename D> struct VSub2; // Primary template is only declared; the explicit specializations supply the bodies.
+    template <> struct VSub2<uint, uint> : binary_function<uint, uint, uint> // Unsigned operands, unsigned result: a - b through the PTX vsub2/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            uint res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub2 instruction covers the whole 32-bit word.
+            asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per half-word selector (.h0, .h1); the first result is fed back in as the last operand of the second.
+            asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub2() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub2(const VSub2<uint, uint>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+    template <> struct VSub2<uint, int> : binary_function<uint, uint, int> // Unsigned operands, signed result: a - b through the PTX vsub2/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ int operator ()(uint a, uint b) const
+        {
+            int res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub2 instruction covers the whole 32-bit word.
+            asm("vsub2.s32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per half-word selector (.h0, .h1); the first result is fed back in as the last operand of the second.
+            asm("vsub.s32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub2() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub2(const VSub2<uint, int>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+    template <> struct VSub2<int, uint> : binary_function<int, int, uint> // Signed operands, unsigned result: a - b through the PTX vsub2/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ uint operator ()(int a, int b) const
+        {
+            uint res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub2 instruction covers the whole 32-bit word.
+            asm("vsub2.u32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per half-word selector (.h0, .h1); the first result is fed back in as the last operand of the second.
+            asm("vsub.u32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.u32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub2() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub2(const VSub2<int, uint>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+    template <> struct VSub2<int, int> : binary_function<int, int, int> // Signed operands, signed result: a - b through the PTX vsub2/vsub instructions with .sat.
+    {
+        __device__ __forceinline__ int operator ()(int a, int b) const
+        {
+            int res = 0; // Stays 0 when neither asm branch below is compiled (__CUDA_ARCH__ < 200 or undefined).
+
+        #if __CUDA_ARCH__ >= 300 // A single vsub2 instruction covers the whole 32-bit word.
+            asm("vsub2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // One vsub per half-word selector (.h0, .h1); the first result is fed back in as the last operand of the second.
+            asm("vsub.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vsub.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VSub2() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ VSub2(const VSub2<int, int>&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+
+    ////////////////////////////////////
+
+    template <typename T, typename D> struct SubMat : binary_function<T, T, D> // Binary functor: element-wise a - b on T operands, saturated into D.
     {
         __device__ __forceinline__ D operator ()(T a, T b) const
         {
             return saturate_cast<D>(a - b);
         }
+
+        __device__ __forceinline__ SubMat() {} // Stateless functor: nothing to initialise.
+        __device__ __forceinline__ SubMat(const SubMat&) {} // Nothing to copy; the source is left unnamed to avoid an unused-parameter warning.
+    };
+}
+
+namespace cv { namespace gpu { namespace device // Reopened so TransformFunctorTraits can be specialized for the subtraction functors.
+{
+    template <typename T, typename D> struct TransformFunctorTraits< VSub4<T, D> > : DefaultTransformFunctorTraits< VSub4<T, D> > // Partial specialization: applies to every VSub4<T, D>.
+    {
+        enum { smart_shift = 2 }; // Presumably overrides a default from DefaultTransformFunctorTraits (defined elsewhere - confirm meaning there).
     };
 
-    template <> struct TransformFunctorTraits< Subtract<ushort, ushort> > : DefaultTransformFunctorTraits< Subtract<ushort, ushort> >
+    ////////////////////////////////////
+
+    template <typename T, typename D> struct TransformFunctorTraits< VSub2<T, D> > : DefaultTransformFunctorTraits< VSub2<T, D> > // Partial specialization: applies to every VSub2<T, D>.
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Subtract<short, short> > : DefaultTransformFunctorTraits< Subtract<short, short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Subtract<int, int> > : DefaultTransformFunctorTraits< Subtract<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Subtract<float, float> > : DefaultTransformFunctorTraits< Subtract<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        enum { smart_shift = 2 }; // Same value as the VSub4 traits.
     };
 
-    template <typename T, typename D> void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
+    ////////////////////////////////////
+
+    template <> struct TransformFunctorTraits< SubMat<ushort, ushort> > : DefaultTransformFunctorTraits< SubMat<ushort, ushort> > // Same-depth SubMat cases below all set the same two enum values.
     {
-        if (mask.data)
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Subtract<T, D>(), SingleMask(mask), stream);
-        else
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, Subtract<T, D>(), WithOutMask(), stream);
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< SubMat<short, short> > : DefaultTransformFunctorTraits< SubMat<short, short> > // short -> short.
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< SubMat<int, int> > : DefaultTransformFunctorTraits< SubMat<int, int> > // int -> int.
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< SubMat<float, float> > : DefaultTransformFunctorTraits< SubMat<float, float> > // float -> float.
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T, typename D> // T: element type both sources are viewed as, D: element type dst is viewed as; together they select the VSub4<T, D> specialization.
+    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Applies VSub4<T, D> to src1 and src2 into dst through transform(), passing `stream` along; there is no mask parameter.
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub4<T, D>(), WithOutMask(), stream); // Always called with the WithOutMask tag.
     }
 
-    template void subtract_gpu<uchar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<uchar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template void vsub4<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // One explicit instantiation per VSub4 specialization (uint/int operands x uint/int result).
+    template void vsub4<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vsub4<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vsub4<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void subtract_gpu<schar, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void subtract_gpu<ushort, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<ushort, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<ushort, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void subtract_gpu<short, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<short, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void subtract_gpu<int, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<int, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<int, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<int, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<int, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void subtract_gpu<float, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<float, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    //template void subtract_gpu<double, uchar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, schar>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, short>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, int>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, float>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-
-    template <typename T, typename D> struct SubtractScalar : unary_function<T, D>
+    template <typename T, typename D>
+    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        SubtractScalar(double val_) : val(val_) {}
-        __device__ __forceinline__ D operator ()(T a) const
-        {
-            return saturate_cast<D>(a - val);
-        }
-        const double val;
-    };
-
-    template <> struct TransformFunctorTraits< SubtractScalar<ushort, ushort> > : DefaultTransformFunctorTraits< SubtractScalar<ushort, ushort>  >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< SubtractScalar<short, short> > : DefaultTransformFunctorTraits< SubtractScalar<short, short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< SubtractScalar<int, int> > : DefaultTransformFunctorTraits< SubtractScalar<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< SubtractScalar<float, float> > : DefaultTransformFunctorTraits< SubtractScalar<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    template <typename T, typename D> void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream)
-    {
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );
-        SubtractScalar<T, D> op(val);
-        if (mask.data)
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, SingleMask(mask), stream);
-        else
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, VSub2<T, D>(), WithOutMask(), stream);
     }
 
-    template void subtract_gpu<uchar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<uchar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template void vsub2<uint, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vsub2<uint, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vsub2<int, uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vsub2<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void subtract_gpu<schar, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template <typename T, typename D>
+    void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    {
+        if (mask.data)
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), mask, stream);
+        else
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, SubMat<T, D>(), WithOutMask(), stream);
+    }
 
-    //template void subtract_gpu<ushort, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<ushort, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<ushort, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<uchar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<uchar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<uchar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subtract_gpu<short, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<short, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    template void subMat<schar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<schar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<schar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<schar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<schar, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<schar, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<schar, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subtract_gpu<int, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<int, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<int, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<int, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<int, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void subMat<ushort, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<ushort, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<ushort, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<ushort, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<ushort, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<ushort, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<ushort, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subtract_gpu<float, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<float, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<float, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void subMat<short, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<short, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<short, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //template void subtract_gpu<double, uchar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, schar>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, short>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, int>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    //template void subtract_gpu<double, float>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-    template void subtract_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    //template void subMat<int, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<int, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<int, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<int, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<int, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<int, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<int, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    //////////////////////////////////////////////////////////////////////////
-    // multiply
+    //template void subMat<float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 
-    struct multiply_8uc4_32f : binary_function<uint, float, uint>
+    //template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// subScalar
+
+namespace arithm
+{
+    template <typename T, typename S, typename D>
+    void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream)
+    {
+        AddScalar<T, S, D> op(-static_cast<S>(val));
+
+        if (mask.data)
+            transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, mask, stream);
+        else
+            transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+    }
+
+    template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void subScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    //template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    //template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// mulMat
+
+namespace
+{
+    struct Mul_8uc4_32f : binary_function<uint, float, uint>
     {
         __device__ __forceinline__ uint operator ()(uint a, float b) const
         {
@@ -453,301 +881,316 @@ namespace cv { namespace gpu { namespace device
 
             return res;
         }
+
+        __device__ __forceinline__ Mul_8uc4_32f() {}
+        __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
     };
 
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_8uc4_32f)
-    {
-        enum { smart_block_dim_x = 8 };
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 8 };
-    };
-
-    void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream)
-    {
-        cv::gpu::device::transform(static_cast< PtrStepSz<uint> >(src1), src2, static_cast< PtrStepSz<uint> >(dst), multiply_8uc4_32f(), WithOutMask(), stream);
-    }
-
-    struct multiply_16sc4_32f : binary_function<short4, float, short4>
+    struct Mul_16sc4_32f : binary_function<short4, float, short4>
     {
         __device__ __forceinline__ short4 operator ()(short4 a, float b) const
         {
             return make_short4(saturate_cast<short>(a.x * b), saturate_cast<short>(a.y * b),
                                saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
         }
+
+        __device__ __forceinline__ Mul_16sc4_32f() {}
+        __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
     };
 
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(multiply_16sc4_32f)
+    template <typename T, typename D> struct Mul : binary_function<T, T, D>
     {
-        enum { smart_block_dim_x = 8 };
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 8 };
+        __device__ __forceinline__ D operator ()(T a, T b) const
+        {
+            return saturate_cast<D>(a * b);
+        }
+
+        __device__ __forceinline__ Mul() {}
+        __device__ __forceinline__ Mul(const Mul& other) {}
     };
 
-    void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream)
+    template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
     {
-        cv::gpu::device::transform(static_cast< PtrStepSz<short4> >(src1), src2, static_cast< PtrStepSz<short4> >(dst), multiply_16sc4_32f(), WithOutMask(), stream);
-    }
+        S scale;
+
+        explicit MulScale(S scale_) : scale(scale_) {}
 
-    template <typename T, typename D> struct Multiply : binary_function<T, T, D>
-    {
-        Multiply(float scale_) : scale(scale_) {}
         __device__ __forceinline__ D operator ()(T a, T b) const
         {
             return saturate_cast<D>(scale * a * b);
         }
-        const float scale;
-    };
-    template <typename T> struct Multiply<T, double> : binary_function<T, T, double>
-    {
-        Multiply(double scale_) : scale(scale_) {}
-        __device__ __forceinline__ double operator ()(T a, T b) const
-        {
-            return scale * a * b;
-        }
-        const double scale;
-    };
-    template <> struct Multiply<int, int> : binary_function<int, int, int>
-    {
-        Multiply(double scale_) : scale(scale_) {}
-        __device__ __forceinline__ int operator ()(int a, int b) const
-        {
-            return saturate_cast<int>(scale * a * b);
-        }
-        const double scale;
     };
+}
 
-    template <> struct TransformFunctorTraits< Multiply<ushort, ushort> > : DefaultTransformFunctorTraits< Multiply<ushort, ushort> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Multiply<short, short> > : DefaultTransformFunctorTraits< Multiply<short, short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Multiply<int, int> > : DefaultTransformFunctorTraits< Multiply<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Multiply<float, float> > : DefaultTransformFunctorTraits< Multiply<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    template <typename T, typename D> struct MultiplyCaller
-    {
-        static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-        {
-            Multiply<T, D> op(static_cast<float>(scale));
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
-        }
-    };
-    template <typename T> struct MultiplyCaller<T, double>
-    {
-        static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-        {
-            cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-            Multiply<T, double> op(scale);
-            cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<double>)dst, op, WithOutMask(), stream);
-        }
-    };
-    template <> struct MultiplyCaller<int, int>
-    {
-        static void call(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-        {
-            cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-            Multiply<int, int> op(scale);
-            cv::gpu::device::transform((PtrStepSz<int>)src1, (PtrStepSz<int>)src2, (PtrStepSz<int>)dst, op, WithOutMask(), stream);
-        }
-    };
-
-    template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-    {
-        MultiplyCaller<T, D>::call(src1, src2, dst, scale, stream);
-    }
-
-    template void multiply_gpu<uchar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<uchar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<schar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<ushort, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<ushort, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<ushort, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<short, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<int, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<float, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<double, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    template <typename T, typename D> struct MultiplyScalar : unary_function<T, D>
-    {
-        MultiplyScalar(double val_, double scale_) : val(val_), scale(scale_) {}
-        __device__ __forceinline__ D operator ()(T a) const
-        {
-            return saturate_cast<D>(scale * a * val);
-        }
-        const double val;
-        const double scale;
-    };
-
-    template <> struct TransformFunctorTraits< MultiplyScalar<ushort, ushort> > : DefaultTransformFunctorTraits< MultiplyScalar<ushort, ushort> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< MultiplyScalar<short, short> > : DefaultTransformFunctorTraits< MultiplyScalar<short, short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< MultiplyScalar<int, int> > : DefaultTransformFunctorTraits< MultiplyScalar<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< MultiplyScalar<float, float> > : DefaultTransformFunctorTraits< MultiplyScalar<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    template <typename T, typename D> void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-    {
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-        MultiplyScalar<T, D> op(val, scale);
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
-    }
-
-    template void multiply_gpu<uchar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<uchar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<schar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<ushort, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<ushort, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<ushort, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<short, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<int, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<int, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<float, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<float, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //template void multiply_gpu<double, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void multiply_gpu<double, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void multiply_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    //////////////////////////////////////////////////////////////////////////
-    // divide
-
-    struct divide_8uc4_32f : binary_function<uchar4, float, uchar4>
-    {
-        __device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const
-        {
-            return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),
-                                        saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b))
-                          : make_uchar4(0,0,0,0);
-        }
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_8uc4_32f)
+namespace cv { namespace gpu { namespace device  // Trait specializations for the multiply functors (Mul_8uc4_32f, Mul, MulScale) that the mulMat* wrappers pass to transform().
+{
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Mul_8uc4_32f)  // NOTE(review): the macro presumably opens a TransformFunctorTraits specialization for Mul_8uc4_32f - confirm against its definition.
     {
         enum { smart_block_dim_x = 8 };
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 8 };
     };
 
-    void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream)
+    template <> struct TransformFunctorTraits< Mul<ushort, ushort> > : DefaultTransformFunctorTraits< Mul<ushort, ushort> >  // Same-type Mul specializations (ushort, short, int, float) all declare smart_block_dim_y = 8 and smart_shift = 4.
     {
-        cv::gpu::device::transform(static_cast< PtrStepSz<uchar4> >(src1), src2, static_cast< PtrStepSz<uchar4> >(dst), divide_8uc4_32f(), WithOutMask(), stream);
+        enum { smart_block_dim_y = 8 };  // NOTE(review): presumably hides a default inherited from DefaultTransformFunctorTraits - confirm in the transform header.
+        enum { smart_shift = 4 };  // NOTE(review): exact meaning of smart_shift is defined by transform(), not visible here - confirm before retuning.
+    };
+    template <> struct TransformFunctorTraits< Mul<short, short> > : DefaultTransformFunctorTraits< Mul<short, short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Mul<int, int> > : DefaultTransformFunctorTraits< Mul<int, int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Mul<float, float> > : DefaultTransformFunctorTraits< Mul<float, float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct TransformFunctorTraits< MulScale<ushort, float, ushort> > : DefaultTransformFunctorTraits< MulScale<ushort, float, ushort> >  // MulScale counterparts: same four element types with a float scale type and the same two enum values.
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< MulScale<short, float, short> > : DefaultTransformFunctorTraits< MulScale<short, float, short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< MulScale<int, float, int> > : DefaultTransformFunctorTraits< MulScale<int, float, int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< MulScale<float, float, float> > : DefaultTransformFunctorTraits< MulScale<float, float, float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    void mulMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream)  // Runs the Mul_8uc4_32f functor over (src1, src2) into dst on `stream`. NOTE(review): each uint presumably packs one 4-channel 8-bit pixel - confirm with the caller.
+    {
+        transform(src1, src2, dst, Mul_8uc4_32f(), WithOutMask(), stream);  // Default-constructed functor; WithOutMask() is passed as the mask argument (presumably "process every element" - confirm).
     }
 
+    void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream)  // Runs the Mul_16sc4_32f functor over short4 elements of src1 and float elements of src2 into dst on `stream`.
+    {
+        transform(src1, src2, dst, Mul_16sc4_32f(), WithOutMask(), stream);  // Default-constructed functor; WithOutMask() is passed as the mask argument (presumably "process every element" - confirm).
+    }
 
-    struct divide_16sc4_32f : binary_function<short4, float, short4>
+    template <typename T, typename S, typename D>  // T: element type of both sources; S: type the scale factor is converted to; D: destination element type.
+    void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream)  // Runs Mul<T, D> (scale == 1) or MulScale<T, S, D> (any other scale) over (src1, src2) into dst via transform() on `stream`.
+    {
+        if (scale == 1)  // Exact floating-point comparison: only a scale of exactly 1 selects the functor that takes no scale.
+        {
+            Mul<T, D> op;  // S plays no role on this path.
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);  // PtrStepSzb arguments are cast to PtrStepSz<T> / PtrStepSz<D> before the call.
+        }
+        else
+        {
+            MulScale<T, S, D> op(static_cast<S>(scale));  // scale is converted from double to S on the host before the functor is handed to transform().
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);  // Same casts and mask argument as the unscaled path; only the functor differs.
+        }
+    }
+
+    template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);  // Explicit instantiations of mulMat<T, S, D>: every active line uses S = float, except D = double which uses S = double.
+    template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    template void mulMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);  // uchar and schar sources are instantiated for all seven destination types.
+    template void mulMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);  // From ushort sources onward, the commented-out combinations are not instantiated, so no symbol exists for them.
+    template void mulMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+
+    //template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);  // double sources are instantiated only for a double destination.
+}
+
+//////////////////////////////////////////////////////////////////////////
+// mulScalar
+
+namespace  // Unnamed namespace: MulScalar has internal linkage and is visible only in this translation unit.
+{
+    template <typename T, typename S, typename D> struct MulScalar : unary_function<T, D>  // Unary functor: multiplies a T input by a fixed scalar of type S and converts the product to D.
+    {
+        S val;  // Scalar factor, set once by the constructor and read by operator().
+
+        explicit MulScalar(S val_) : val(val_) {}  // No __device__ qualifier, so this is a host-side constructor; `explicit` blocks implicit conversion from S.
+
+        __device__ __forceinline__ D operator ()(T a) const  // Device-side call operator; does not modify the functor.
+        {
+            return saturate_cast<D>(a * val);  // The product is formed under the usual arithmetic conversions of T and S, then passed through saturate_cast<D>.
+        }
+    };
+}
+
+namespace cv { namespace gpu { namespace device  // Trait specializations for MulScalar: the four same-type cases (ushort, short, int, float) with a float scalar.
+{
+    template <> struct TransformFunctorTraits< MulScalar<ushort, float, ushort> > : DefaultTransformFunctorTraits< MulScalar<ushort, float, ushort> >  // All four specializations declare smart_block_dim_y = 8 and smart_shift = 4.
+    {
+        enum { smart_block_dim_y = 8 };  // NOTE(review): presumably hides a default inherited from DefaultTransformFunctorTraits - confirm in the transform header.
+        enum { smart_shift = 4 };  // NOTE(review): exact meaning of smart_shift is defined by transform(), not visible here - confirm before retuning.
+    };
+    template <> struct TransformFunctorTraits< MulScalar<short, float, short> > : DefaultTransformFunctorTraits< MulScalar<short, float, short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< MulScalar<int, float, int> > : DefaultTransformFunctorTraits< MulScalar<int, float, int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< MulScalar<float, float, float> > : DefaultTransformFunctorTraits< MulScalar<float, float, float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T, typename S, typename D>  // T: source element type; S: type the scalar is converted to; D: destination element type.
+    void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)  // Runs MulScalar<T, S, D> over src1 into dst via transform() on `stream`; unlike mulMat there is no separate scale parameter.
+    {
+        MulScalar<T, S, D> op(static_cast<S>(val));  // val is converted from double to S on the host before the functor is handed to transform().
+        transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);  // PtrStepSzb arguments are cast to PtrStepSz<T> / PtrStepSz<D>; WithOutMask() is passed as the mask argument.
+    }
+
+    template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    template void mulScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// divMat
+
+namespace
+{
+    struct Div_8uc4_32f : binary_function<uint, float, uint>
+    {
+        // Divides each of the four bytes packed in a by b; b == 0 yields 0.
+        __device__ __forceinline__ uint operator ()(uint a, float b) const
+        {
+            if (b == 0)
+                return 0;
+
+            // One reciprocal, then one multiply per byte lane.
+            const float inv = 1.0f / b;
+            const uint b0 = saturate_cast<uchar>((0xffu & (a      )) * inv);
+            const uint b1 = saturate_cast<uchar>((0xffu & (a >>  8)) * inv);
+            const uint b2 = saturate_cast<uchar>((0xffu & (a >> 16)) * inv);
+            const uint b3 = saturate_cast<uchar>((0xffu & (a >> 24)) * inv);
+
+            return b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+        }
+    };
+
+    struct Div_16sc4_32f : binary_function<short4, float, short4>
     {
         __device__ __forceinline__ short4 operator ()(short4 a, float b) const
         {
@@ -757,425 +1200,905 @@ namespace cv { namespace gpu { namespace device
         }
     };
 
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(divide_16sc4_32f)
+    template <typename T, typename D> struct Div : binary_function<T, T, D>
+    {
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div&) {}
+
+        __device__ __forceinline__ D operator ()(T num, T den) const
+        {
+            return den == 0 ? 0 : saturate_cast<D>(num / den); // zero divisor -> 0
+        }
+    };
+    template <typename T> struct Div<T, float> : binary_function<T, T, float>
+    {
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div&) {}
+
+        __device__ __forceinline__ float operator ()(T num, T den) const
+        {
+            return den == 0 ? 0 : static_cast<float>(num) / den; // numerator widened to float first
+        }
+    };
+    template <typename T> struct Div<T, double> : binary_function<T, T, double>
+    {
+        __device__ __forceinline__ Div() {}
+        __device__ __forceinline__ Div(const Div&) {}
+
+        __device__ __forceinline__ double operator ()(T num, T den) const
+        {
+            return den == 0 ? 0 : static_cast<double>(num) / den; // numerator widened to double first
+        }
+    };
+
+    template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
+    {
+        explicit DivScale(S scale_) : scale(scale_) {}
+
+        __device__ __forceinline__ D operator ()(T num, T den) const
+        {
+            return den == 0 ? 0 : saturate_cast<D>(scale * num / den); // zero divisor -> 0
+        }
+
+        S scale; // factor applied to the numerator before dividing
+    };
+}
+
+namespace cv { namespace gpu { namespace device // TransformFunctorTraits overrides for the division functors
+{
+    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(Div_8uc4_32f)
     {
         enum { smart_block_dim_x = 8 };
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 8 };
     };
 
-    void divide_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream)
+    template <> struct TransformFunctorTraits< Div<ushort, ushort> > : DefaultTransformFunctorTraits< Div<ushort, ushort> >
     {
-        cv::gpu::device::transform(static_cast< PtrStepSz<short4> >(src1), src2, static_cast< PtrStepSz<short4> >(dst), divide_16sc4_32f(), WithOutMask(), stream);
+        enum { smart_block_dim_y = 8 }; // only same-type (T == D) Div functors get an override in this block
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Div<short, short> > : DefaultTransformFunctorTraits< Div<short, short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Div<int, int> > : DefaultTransformFunctorTraits< Div<int, int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Div<float, float> > : DefaultTransformFunctorTraits< Div<float, float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct TransformFunctorTraits< DivScale<ushort, float, ushort> > : DefaultTransformFunctorTraits< DivScale<ushort, float, ushort> >
+    {
+        enum { smart_block_dim_y = 8 }; // scaled variants use the same values as the matching Div override
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< DivScale<short, float, short> > : DefaultTransformFunctorTraits< DivScale<short, float, short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< DivScale<int, float, int> > : DefaultTransformFunctorTraits< DivScale<int, float, int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< DivScale<float, float, float> > : DefaultTransformFunctorTraits< DivScale<float, float, float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    void divMat_8uc4_32f(PtrStepSz<uint> src1, PtrStepSzf src2, PtrStepSz<uint> dst, cudaStream_t stream) // src1/dst carry four packed 8-bit values per uint
+    {
+        transform(src1, src2, dst, Div_8uc4_32f(), WithOutMask(), stream); // runs the Div_8uc4_32f functor with no mask on the given stream
     }
 
-    template <typename T, typename D> struct Divide : binary_function<T, T, D>
+    void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream) // short4 elements divided by a float image
     {
-        Divide(double scale_) : scale(scale_) {}
-        __device__ __forceinline__ D operator ()(T a, T b) const
+        transform(src1, src2, dst, Div_16sc4_32f(), WithOutMask(), stream); // runs the Div_16sc4_32f functor with no mask on the given stream
+    }
+
+    template <typename T, typename S, typename D>
+    void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream) // T: source element, S: scale type, D: destination element
+    {
+        if (scale == 1) // unit scale: use the plain Div functor, which has no scale multiply
         {
-            return b != 0 ? saturate_cast<D>(a * scale / b) : 0;
+            Div<T, D> op;
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        }
+        else
+        {
+            DivScale<T, S, D> op(static_cast<S>(scale)); // scale is converted to S before it is stored in the functor
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
         }
-        const double scale;
-    };
-
-    template <> struct TransformFunctorTraits< Divide<ushort, ushort> > : DefaultTransformFunctorTraits< Divide<ushort, ushort> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Divide<short, short> > : DefaultTransformFunctorTraits< Divide<short, short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Divide<int, int> > : DefaultTransformFunctorTraits< Divide<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Divide<float, float> > : DefaultTransformFunctorTraits< Divide<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    template <typename T, typename D> void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream)
-    {
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-        Divide<T, D> op(scale);
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
     }
 
-    template void divide_gpu<uchar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<uchar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<uchar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<uchar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<uchar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
-    //template void divide_gpu<schar, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    template void divMat<schar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<schar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<schar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<schar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<schar, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<schar, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<schar, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
-    //template void divide_gpu<ushort, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<ushort, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<ushort, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divMat<ushort, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<ushort, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<ushort, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<ushort, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<ushort, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<ushort, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<ushort, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
-    //template void divide_gpu<short, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<short, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<short, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divMat<short, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<short, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<short, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
-    //template void divide_gpu<int, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<int, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<int, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<int, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<int, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<int, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<int, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divMat<int, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<int, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<int, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<int, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<int, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<int, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<int, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
-    //template void divide_gpu<float, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<float, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<float, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 
-    //template void divide_gpu<double, uchar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, schar >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, ushort>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, short >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, int   >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, float >(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<double, double>(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    //template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+}
 
-    template <typename T, typename D> struct DivideScalar : unary_function<T, D>
+//////////////////////////////////////////////////////////////////////////
+// divScalar
+
+namespace arithm
+{
+    template <typename T, typename S, typename D>
+    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) // builds a MulScalar functor from 1.0 / val (MulScalar is defined outside this chunk)
     {
-        DivideScalar(double val_, double scale_) : val(val_), scale(scale_) {}
+        MulScalar<T, S, D> op(static_cast<S>(1.0 / val)); // NOTE(review): no guard for val == 0 here — confirm callers handle it
+        transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+    }
+
+    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+
+    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// divInv
+
+namespace
+{
+    template <typename T, typename S, typename D> struct DivInv : unary_function<T, D>
+    {
+        S val; // numerator of the per-element quotient val / a
+
+        explicit DivInv(S val_) : val(val_) {} // takes S like DivScale; avoids an S -> double -> S round trip
+
         __device__ __forceinline__ D operator ()(T a) const
         {
-            return saturate_cast<D>(scale * a / val);
+            return a != 0 ? saturate_cast<D>(val / a) : 0; // a zero element yields 0 instead of dividing by zero
         }
-        const double val;
-        const double scale;
     };
+}
 
-    template <> struct TransformFunctorTraits< DivideScalar<ushort, ushort> > : DefaultTransformFunctorTraits< DivideScalar<ushort, ushort> >
+namespace cv { namespace gpu { namespace device // TransformFunctorTraits overrides for the DivInv functor
+{
+    template <> struct TransformFunctorTraits< DivInv<ushort, float, ushort> > : DefaultTransformFunctorTraits< DivInv<ushort, float, ushort> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< DivideScalar<short, short> > : DefaultTransformFunctorTraits< DivideScalar<short, short> >
+    template <> struct TransformFunctorTraits< DivInv<short, float, short> > : DefaultTransformFunctorTraits< DivInv<short, float, short> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< DivideScalar<int, int> > : DefaultTransformFunctorTraits< DivideScalar<int, int> >
+    template <> struct TransformFunctorTraits< DivInv<int, float, int> > : DefaultTransformFunctorTraits< DivInv<int, float, int> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< DivideScalar<float, float> > : DefaultTransformFunctorTraits< DivideScalar<float, float> >
+    template <> struct TransformFunctorTraits< DivInv<float, float, float> > : DefaultTransformFunctorTraits< DivInv<float, float, float> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
+}}}
 
-    template <typename T, typename D> void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream)
+namespace arithm
+{
+    template <typename T, typename S, typename D>
+    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream) // Hands a DivInv<T, S, D> functor to transform() with src1 as the input and dst as the output.
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );
-        cudaSafeCall( cudaSetDoubleForDevice(&scale) );
-        DivideScalar<T, D> op(val, scale);
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+        DivInv<T, S, D> op(static_cast<S>(val)); // The scalar is cast to the working type S before the functor is built.
+        transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream); // src1 is viewed as T elements and dst as D elements; WithOutMask() is passed as the mask argument.
     }
 
-    template void divide_gpu<uchar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<uchar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<uchar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divide_gpu<schar, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<schar, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divide_gpu<ushort, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<ushort, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<ushort, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<ushort, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divide_gpu<short, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<short, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<short, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<short, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divide_gpu<int, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<int, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<int, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<int, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<int, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<int, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<int, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divide_gpu<float, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<float, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<float, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<float, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divide_gpu<double, uchar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, schar >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, ushort>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, short >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, int   >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    //template void divide_gpu<double, float >(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-    template void divide_gpu<double, double>(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+}
 
-    template <typename T, typename D> struct Reciprocal : unary_function<T, D>
+//////////////////////////////////////////////////////////////////////////
+// absDiffMat
+
+namespace
+{
+    template <typename T, typename D> struct VAbsDiff4;
+    template <> struct VAbsDiff4<uint, uint> : binary_function<uint, uint, uint> // Binary functor wrapping the unsigned, saturating PTX vabsdiff4 / vabsdiff instructions.
     {
-        Reciprocal(double scale_) : scale(scale_) {}
-        __device__ __forceinline__ D operator ()(T a) const
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            return a != 0 ? saturate_cast<D>(scale / a) : 0;
+            uint res = 0; // Also fed to the asm as its last operand (%3); stays 0 when neither branch below is compiled.
+
+        #if __CUDA_ARCH__ >= 300 // a single vabsdiff4 instruction covers the whole 32-bit word
+            asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // four vabsdiff instructions, one per byte selector b0..b3, each taking the previous res as %3
+            asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
         }
-        const double scale;
+
+        __device__ __forceinline__ VAbsDiff4() {} // Empty device-side default constructor; the functor has no data members.
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<uint, uint>& other) {} // Empty device-side copy constructor; nothing to copy.
+    };
+    template <> struct VAbsDiff4<int, int> : binary_function<int, int, int> // Binary functor wrapping the signed, saturating PTX vabsdiff4 / vabsdiff instructions.
+    {
+        __device__ __forceinline__ int operator ()(int a, int b) const
+        {
+            int res = 0; // Also fed to the asm as its last operand (%3); stays 0 when neither branch below is compiled.
+
+        #if __CUDA_ARCH__ >= 300 // a single vabsdiff4 instruction covers the whole 32-bit word
+            asm("vabsdiff4.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // four vabsdiff instructions, one per byte selector b0..b3, each taking the previous res as %3
+            asm("vabsdiff.s32.s32.s32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.s32.s32.s32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.s32.s32.s32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.s32.s32.s32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VAbsDiff4() {} // Empty device-side default constructor; the functor has no data members.
+        __device__ __forceinline__ VAbsDiff4(const VAbsDiff4<int, int>& other) {} // Empty device-side copy constructor; nothing to copy.
     };
 
-    template <> struct TransformFunctorTraits< Reciprocal<ushort, ushort> > : DefaultTransformFunctorTraits< Reciprocal<ushort, ushort> >
+    ////////////////////////////////////
+
+    template <typename T, typename D> struct VAbsDiff2;
+    template <> struct VAbsDiff2<uint, uint> : binary_function<uint, uint, uint> // Binary functor wrapping the unsigned, saturating PTX vabsdiff2 / vabsdiff instructions.
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
+        {
+            uint res = 0; // Also fed to the asm as its last operand (%3); stays 0 when neither branch below is compiled.
+
+        #if __CUDA_ARCH__ >= 300 // a single vabsdiff2 instruction covers the whole 32-bit word
+            asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // two vabsdiff instructions, one per half-word selector h0/h1, each taking the previous res as %3
+            asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VAbsDiff2() {} // Empty device-side default constructor; the functor has no data members.
+        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<uint, uint>& other) {} // Empty device-side copy constructor; nothing to copy.
     };
-    template <> struct TransformFunctorTraits< Reciprocal<short, short> > : DefaultTransformFunctorTraits< Reciprocal<short, short> >
+    template <> struct VAbsDiff2<int, int> : binary_function<int, int, int> // Binary functor wrapping the signed, saturating PTX vabsdiff2 / vabsdiff instructions.
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Reciprocal<int, int> > : DefaultTransformFunctorTraits< Reciprocal<int, int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Reciprocal<float, float> > : DefaultTransformFunctorTraits< Reciprocal<float, float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        __device__ __forceinline__ int operator ()(int a, int b) const
+        {
+            int res = 0; // Also fed to the asm as its last operand (%3); stays 0 when neither branch below is compiled.
+
+        #if __CUDA_ARCH__ >= 300 // a single vabsdiff2 instruction covers the whole 32-bit word
+            asm("vabsdiff2.s32.s32.s32.sat %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // two vabsdiff instructions, one per half-word selector h0/h1, each taking the previous res as %3
+            asm("vabsdiff.s32.s32.s32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vabsdiff.s32.s32.s32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res;
+        }
+
+        __device__ __forceinline__ VAbsDiff2() {} // Empty device-side default constructor; the functor has no data members.
+        __device__ __forceinline__ VAbsDiff2(const VAbsDiff2<int, int>& other) {} // Empty device-side copy constructor; nothing to copy.
     };
 
-    template <typename T, typename D> void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream)
+    ////////////////////////////////////
+
+    __device__ __forceinline__ int _abs(int a) // Overload set picking the absolute-value routine by argument type; this one forwards ints to ::abs.
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&scalar) );
-        Reciprocal<T, D> op(scalar);
-        cv::gpu::device::transform((PtrStepSz<T>)src2, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
+        return ::abs(a);
+    }
+    __device__ __forceinline__ float _abs(float a) // float overload: forwards to ::fabsf.
+    {
+        return ::fabsf(a);
+    }
+    __device__ __forceinline__ double _abs(double a) // double overload: forwards to ::fabs.
+    {
+        return ::fabs(a);
     }
 
-    template void divide_gpu<uchar, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<uchar, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<uchar, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<uchar, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<uchar, int   >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<uchar, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<uchar, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu<schar, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<schar, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<schar, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<schar, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<schar, int   >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<schar, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<schar, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu<ushort, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<ushort, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<ushort, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<ushort, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<ushort, int   >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<ushort, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<ushort, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu<short, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<short, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<short, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<short, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<short, int   >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<short, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<short, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu<int, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<int, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<int, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<int, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<int, int   >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<int, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<int, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu<float, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<float, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<float, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<float, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<float, int   >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<float, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<float, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //template void divide_gpu<double, uchar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<double, schar >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<double, ushort>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<double, short >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<double, int   >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    //template void divide_gpu<double, float >(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-    template void divide_gpu<double, double>(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-
-    //////////////////////////////////////////////////////////////////////////
-    // absdiff
-
-    template <typename T> struct Absdiff : binary_function<T, T, T>
+    template <typename T> struct AbsDiffMat : binary_function<T, T, T> // Binary functor returning |a - b| saturated to T.
     {
-        static __device__ __forceinline__ int abs(int a)
-        {
-            return ::abs(a);
-        }
-        static __device__ __forceinline__ float abs(float a)
-        {
-            return ::fabsf(a);
-        }
-        static __device__ __forceinline__ double abs(double a)
-        {
-            return ::fabs(a);
-        }
-
         __device__ __forceinline__ T operator ()(T a, T b) const
         {
-            return saturate_cast<T>(::abs(a - b));
+            return saturate_cast<T>(_abs(a - b)); // _abs is chosen by overload on the type of (a - b); the result is then saturated back to T.
         }
+
+        __device__ __forceinline__ AbsDiffMat() {} // Empty device-side default constructor; the functor has no data members.
+        __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {} // Empty device-side copy constructor; nothing to copy.
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <typename T, typename D> struct TransformFunctorTraits< VAbsDiff4<T, D> > : DefaultTransformFunctorTraits< VAbsDiff4<T, D> > // Traits for every VAbsDiff4 functor: the defaults, with only smart_shift overridden.
+    {
+        enum { smart_shift = 2 };
     };
 
-    template <> struct TransformFunctorTraits< Absdiff<ushort> > : DefaultTransformFunctorTraits< Absdiff<ushort> >
+    ////////////////////////////////////
+
+    template <typename T, typename D> struct TransformFunctorTraits< VAbsDiff2<T, D> > : DefaultTransformFunctorTraits< VAbsDiff2<T, D> > // Traits for every VAbsDiff2 functor: the defaults for VAbsDiff2 itself, with only smart_shift overridden.
     {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Absdiff<short> > : DefaultTransformFunctorTraits< Absdiff<short> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Absdiff<int> > : DefaultTransformFunctorTraits< Absdiff<int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Absdiff<float> > : DefaultTransformFunctorTraits< Absdiff<float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+        enum { smart_shift = 2 };
     };
 
-    template <typename T> void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    ////////////////////////////////////
+
+    template <> struct TransformFunctorTraits< AbsDiffMat<ushort> > : DefaultTransformFunctorTraits< AbsDiffMat<ushort> > // AbsDiffMat<ushort>: default traits plus the two enum overrides below.
     {
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, Absdiff<T>(), WithOutMask(), stream);
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< AbsDiffMat<short> > : DefaultTransformFunctorTraits< AbsDiffMat<short> > // AbsDiffMat<short>: same two overrides.
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< AbsDiffMat<int> > : DefaultTransformFunctorTraits< AbsDiffMat<int> > // AbsDiffMat<int>: same two overrides.
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< AbsDiffMat<float> > : DefaultTransformFunctorTraits< AbsDiffMat<float> > // AbsDiffMat<float>: same two overrides.
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T>
+    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Hands a VAbsDiff4<T, T> functor to transform() with src1 and src2 as inputs and dst as the output.
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff4<T, T>(), WithOutMask(), stream); // All three buffers are viewed as T elements; WithOutMask() is passed as the mask argument.
     }
 
-    //template void absdiff_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    //template void absdiff_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<int   >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    //template void absdiff_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vabsDiff4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vabsDiff4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template <typename T> struct AbsdiffScalar : unary_function<T, T>
+    template <typename T>
+    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Hands a VAbsDiff2<T, T> functor to transform() with src1 and src2 as inputs and dst as the output.
     {
-        AbsdiffScalar(double val_) : val(val_) {}
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VAbsDiff2<T, T>(), WithOutMask(), stream); // All three buffers are viewed as T elements; WithOutMask() is passed as the mask argument.
+    }
+
+    template void vabsDiff2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vabsDiff2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template <typename T>
+    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream) // Hands an AbsDiffMat<T> functor to transform() with src1 and src2 as inputs and dst as the output.
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, AbsDiffMat<T>(), WithOutMask(), stream); // All three buffers are viewed as T elements; WithOutMask() is passed as the mask argument.
+    }
+
+    template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// absDiffScalar
+
+namespace
+{
+    template <typename T, typename S> struct AbsDiffScalar : unary_function<T, T> // |a - val| functor: T is the element type, S the type of the stored scalar
+    {
+        S val; // scalar operand subtracted from every input value
+
+        explicit AbsDiffScalar(S val_) : val(val_) {} // no __device__ qualifier: built on the host (see absDiffScalar)
+
         __device__ __forceinline__ T operator ()(T a) const
         {
-            return saturate_cast<T>(::fabs(a - val));
+            abs_func<S> f; // absolute value is taken by the S specialization of abs_func
+            return saturate_cast<T>(f(a - val)); // difference goes through abs_func<S>, then is converted back to T
         }
-        double val;
     };
+}
 
-    template <> struct TransformFunctorTraits< AbsdiffScalar<ushort> > : DefaultTransformFunctorTraits< AbsdiffScalar<ushort> >
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits< AbsDiffScalar<ushort, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<ushort, float>  >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< AbsdiffScalar<short> > : DefaultTransformFunctorTraits< AbsdiffScalar<short> >
+    template <> struct TransformFunctorTraits< AbsDiffScalar<short, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<short, float> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< AbsdiffScalar<int> > : DefaultTransformFunctorTraits< AbsdiffScalar<int> >
+    template <> struct TransformFunctorTraits< AbsDiffScalar<int, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<int, float> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
-    template <> struct TransformFunctorTraits< AbsdiffScalar<float> > : DefaultTransformFunctorTraits< AbsdiffScalar<float> >
+    template <> struct TransformFunctorTraits< AbsDiffScalar<float, float> > : DefaultTransformFunctorTraits< AbsDiffScalar<float, float> >
     {
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
+}}}
 
-    template <typename T> void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+    template <typename T, typename S> // T: element type of src1/dst, S: type the scalar is stored and processed in
+    void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
     {
-        cudaSafeCall( cudaSetDoubleForDevice(&val) );
-        AbsdiffScalar<T> op(val);
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)dst, op, WithOutMask(), stream);
+        AbsDiffScalar<T, S> op(static_cast<S>(val)); // the double scalar is converted to S once, on the host
+
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, op, WithOutMask(), stream); // one-input transform, no mask, stream forwarded as given
     }
 
-    //template void absdiff_gpu<uchar >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<schar >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    //template void absdiff_gpu<ushort>(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<short >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<int   >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    //template void absdiff_gpu<float >(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
-    template void absdiff_gpu<double>(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+}
 
-    //////////////////////////////////////////////////////////////////////////////////////
-    // Compare
+//////////////////////////////////////////////////////////////////////////
+// absMat
 
-    template <template <typename> class Op, typename T>
-    struct Compare: binary_function<T, T, uchar>
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits< abs_func<ushort> > : DefaultTransformFunctorTraits< abs_func<ushort> >
     {
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< abs_func<short> > : DefaultTransformFunctorTraits< abs_func<short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< abs_func<int> > : DefaultTransformFunctorTraits< abs_func<int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< abs_func<float> > : DefaultTransformFunctorTraits< abs_func<float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T> // T: element type the byte-typed PtrStepSzb views are cast to
+    void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream); // one-input transform with abs_func<T>, no mask, stream forwarded as given
+    }
+
+    template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// sqrMat
+
+namespace
+{
+    template <typename T> struct Sqr : unary_function<T, T> // functor returning the square of its argument, converted back to T
+    {
+        __device__ __forceinline__ T operator ()(T x) const
+        {
+            return saturate_cast<T>(x * x); // NOTE(review): x * x is evaluated before saturate_cast; for T = int the product can overflow int - confirm this is acceptable
+        }
+
+        __device__ __forceinline__ Sqr() {} // explicit empty device-side default constructor
+        __device__ __forceinline__ Sqr(const Sqr& other) {} // the functor has no data members, so the copy has nothing to copy
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits< Sqr<ushort> > : DefaultTransformFunctorTraits< Sqr<ushort> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Sqr<short> > : DefaultTransformFunctorTraits< Sqr<short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Sqr<int> > : DefaultTransformFunctorTraits< Sqr<int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Sqr<float> > : DefaultTransformFunctorTraits< Sqr<float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T> // T: element type the byte-typed PtrStepSzb views are cast to
+    void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream); // one-input transform with Sqr<T>, no mask, stream forwarded as given
+    }
+
+    template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// sqrtMat
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits< sqrt_func<uchar> > : DefaultTransformFunctorTraits< sqrt_func<uchar> > // defaults are taken for the same uchar functor that is being specialized
+    {
+        enum { smart_shift = 4 }; // only smart_shift is set here; every other trait comes from the default base
+    };
+    template <> struct TransformFunctorTraits< sqrt_func<schar> > : DefaultTransformFunctorTraits< sqrt_func<schar> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< sqrt_func<ushort> > : DefaultTransformFunctorTraits< sqrt_func<ushort> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< sqrt_func<short> > : DefaultTransformFunctorTraits< sqrt_func<short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< sqrt_func<int> > : DefaultTransformFunctorTraits< sqrt_func<int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< sqrt_func<float> > : DefaultTransformFunctorTraits< sqrt_func<float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T> // T: element type the byte-typed PtrStepSzb views are cast to
+    void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream); // one-input transform with sqrt_func<T>, no mask, stream forwarded as given
+    }
+
+    template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// logMat
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits< log_func<uchar> > : DefaultTransformFunctorTraits< log_func<uchar> > // defaults are taken for the same uchar functor that is being specialized
+    {
+        enum { smart_shift = 4 }; // only smart_shift is set here; every other trait comes from the default base
+    };
+    template <> struct TransformFunctorTraits< log_func<schar> > : DefaultTransformFunctorTraits< log_func<schar> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< log_func<ushort> > : DefaultTransformFunctorTraits< log_func<ushort> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< log_func<short> > : DefaultTransformFunctorTraits< log_func<short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< log_func<int> > : DefaultTransformFunctorTraits< log_func<int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< log_func<float> > : DefaultTransformFunctorTraits< log_func<float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T> // T: element type the byte-typed PtrStepSzb views are cast to
+    void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream); // one-input transform with log_func<T>, no mask, stream forwarded as given
+    }
+
+    template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// expMat
+
+namespace
+{
+    template <typename T> struct Exp : unary_function<T, T> // functor that applies exp_func<T> and converts the result back to T
+    {
+        __device__ __forceinline__ T operator ()(T x) const
+        {
+            exp_func<T> f; // exp_func is defined outside this chunk; its return type is not visible here
+            return saturate_cast<T>(f(x)); // whatever exp_func<T> returns is brought back to T through saturate_cast
+        }
+
+        __device__ __forceinline__ Exp() {} // explicit empty device-side default constructor
+        __device__ __forceinline__ Exp(const Exp& other) {} // the functor has no data members, so the copy has nothing to copy
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <> struct TransformFunctorTraits< Exp<ushort> > : DefaultTransformFunctorTraits< Exp<ushort> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Exp<short> > : DefaultTransformFunctorTraits< Exp<short> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Exp<int> > : DefaultTransformFunctorTraits< Exp<int> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< Exp<float> > : DefaultTransformFunctorTraits< Exp<float> >
+    {
+        enum { smart_block_dim_y = 8 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T> // T: element type the byte-typed PtrStepSzb views are cast to
+    void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream); // one-input transform with Exp<T>, no mask, stream forwarded as given
+    }
+
+    template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// cmpMat
+
+namespace
+{
+    template <template <typename> class Op, typename T> // Op: comparison functor template instantiated as Op<T>
+    struct Cmp: binary_function<T, T, uchar> // compares two T values and encodes the outcome as a uchar
+    {
+        __device__ __forceinline__ uchar operator()(T a, T b) const
         {
             Op<T> op;
-            return static_cast<uchar>(static_cast<int>(op(src1, src2)) * 255);
+            return -op(a, b); // negating the 0/1 comparison result gives 0 or -1, which converts to uchar 0 or 255
         }
     };
+}
 
-#define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \
-    template <> struct TransformFunctorTraits< Compare<op, type> > : DefaultTransformFunctorTraits< Compare<op, type> > \
-    { \
-        enum { smart_block_dim_y = block_dim_y }; \
-        enum { smart_shift = shift }; \
-    };
+namespace cv { namespace gpu { namespace device
+{
+    #define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \
+        template <> struct TransformFunctorTraits< Cmp<op, type> > : DefaultTransformFunctorTraits< Cmp<op, type> > \
+        { \
+            enum { smart_block_dim_y = block_dim_y }; \
+            enum { smart_shift = shift }; \
+        };
 
     IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, int, 8, 4)
     IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, float, 8, 4)
@@ -1190,132 +2113,136 @@ namespace cv { namespace gpu { namespace device
     IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, int, 8, 4)
     IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, float, 8, 4)
 
-#undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS
+    #undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS
+}}}
 
-    template <template <typename> class Op, typename T> void compare(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+    template <template <typename> class Op, typename T>
+    void cmpMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        Compare<Op, T> op;
-        cv::gpu::device::transform(static_cast< PtrStepSz<T> >(src1), static_cast< PtrStepSz<T> >(src2), dst, op, WithOutMask(), stream);
+        Cmp<Op, T> op;
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, dst, op, WithOutMask(), stream);
     }
 
-    template <typename T> void compare_eq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        compare<equal_to, T>(src1, src2, dst, stream);
+        cmpMat<equal_to, T>(src1, src2, dst, stream);
     }
-    template <typename T> void compare_ne(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        compare<not_equal_to, T>(src1, src2, dst, stream);
+        cmpMat<not_equal_to, T>(src1, src2, dst, stream);
     }
-    template <typename T> void compare_lt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        compare<less, T>(src1, src2, dst, stream);
+        cmpMat<less, T>(src1, src2, dst, stream);
     }
-    template <typename T> void compare_le(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        compare<less_equal, T>(src1, src2, dst, stream);
+        cmpMat<less_equal, T>(src1, src2, dst, stream);
     }
 
-    template void compare_eq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatEq<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_ne<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatNe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_lt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLt<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_le<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLe<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+}
 
+//////////////////////////////////////////////////////////////////////////////////////
+// cmpScalar
+
+namespace
+{
 #define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
 
-    template <template <typename> class Op, typename T, int cn> struct CompareScalar;
+    template <template <typename> class Op, typename T, int cn> struct CmpScalar;
     template <template <typename> class Op, typename T>
-    struct CompareScalar<Op, T, 1>: unary_function<T, uchar>
+    struct CmpScalar<Op, T, 1> : unary_function<T, uchar>
     {
         const T val;
 
-        __host__ explicit CompareScalar(T val_) : val(val_) {}
+        __host__ explicit CmpScalar(T val_) : val(val_) {}
 
         __device__ __forceinline__ uchar operator()(T src) const
         {
-            Op<T> op;
-            return static_cast<uchar>(static_cast<int>(op(src, val)) * 255);
+            Cmp<Op, T> op;
+            return op(src, val);
         }
     };
     template <template <typename> class Op, typename T>
-    struct CompareScalar<Op, T, 2>: unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
+    struct CmpScalar<Op, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
     {
         const TYPE_VEC(T, 2) val;
 
-        __host__ explicit CompareScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
+        __host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
 
         __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
         {
-            Op<T> op;
-            return VecTraits<TYPE_VEC(uchar, 2)>::make(
-                        static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255),
-                        static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255));
+            Cmp<Op, T> op;
+            return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
         }
     };
     template <template <typename> class Op, typename T>
-    struct CompareScalar<Op, T, 3>: unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
+    struct CmpScalar<Op, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
     {
         const TYPE_VEC(T, 3) val;
 
-        __host__ explicit CompareScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
+        __host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
 
         __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
         {
-            Op<T> op;
-            return VecTraits<TYPE_VEC(uchar, 3)>::make(
-                        static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255),
-                        static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255),
-                        static_cast<uchar>(static_cast<int>(op(src.z, val.z)) * 255));
+            Cmp<Op, T> op;
+            return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
         }
     };
     template <template <typename> class Op, typename T>
-    struct CompareScalar<Op, T, 4>: unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
+    struct CmpScalar<Op, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
     {
         const TYPE_VEC(T, 4) val;
 
-        __host__ explicit CompareScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
+        __host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
 
         __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
         {
-            Op<T> op;
-            return VecTraits<TYPE_VEC(uchar, 4)>::make(
-                        static_cast<uchar>(static_cast<int>(op(src.x, val.x)) * 255),
-                        static_cast<uchar>(static_cast<int>(op(src.y, val.y)) * 255),
-                        static_cast<uchar>(static_cast<int>(op(src.z, val.z)) * 255),
-                        static_cast<uchar>(static_cast<int>(op(src.w, val.w)) * 255));
+            Cmp<Op, T> op;
+            return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
         }
     };
 
 #undef TYPE_VEC
+}
 
+namespace cv { namespace gpu { namespace device
+{
 #define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \
-    template <> struct TransformFunctorTraits< CompareScalar<op, type, 1> > : DefaultTransformFunctorTraits< CompareScalar<op, type, 1> > \
+    template <> struct TransformFunctorTraits< CmpScalar<op, type, 1> > : DefaultTransformFunctorTraits< CmpScalar<op, type, 1> > \
     { \
         enum { smart_block_dim_y = block_dim_y }; \
         enum { smart_shift = shift }; \
@@ -1335,8 +2262,12 @@ namespace cv { namespace gpu { namespace device
     IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, float, 8, 4)
 
 #undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS
+}}}
 
-    template <template <typename> class Op, typename T, int cn> void compare(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream)
+namespace arithm
+{
+    template <template <typename> class Op, typename T, int cn>
+    void cmpScalar(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream) // runs CmpScalar<Op, T, cn> over src via transform(); dst elements are uchar vectors with cn channels
     {
         typedef typename TypeVec<T, cn>::vec_type src_t;
         typedef typename TypeVec<uchar, cn>::vec_type dst_t;
@@ -1344,505 +2275,721 @@ namespace cv { namespace gpu { namespace device
         T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
         src_t val1 = VecTraits<src_t>::make(sval);
 
-        CompareScalar<Op, T, cn> op(val1);
-
-        cv::gpu::device::transform(static_cast< PtrStepSz<src_t> >(src), static_cast< PtrStepSz<dst_t> >(dst), op, WithOutMask(), stream);
+        CmpScalar<Op, T, cn> op(val1); // functor holding the scalar already converted from double to src_t
+        transform((PtrStepSz<src_t>) src, (PtrStepSz<dst_t>) dst, op, WithOutMask(), stream); // byte-typed src/dst are cast to src_t/dst_t views; WithOutMask() is passed as the mask argument
     }
 
-    template <typename T> void compare_eq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) // forwards to cmpScalar<equal_to, T, cn>, chosen at run time from cn
     {
         typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
         static const func_t funcs[] =
         {
             0,
-            compare<equal_to, T, 1>,
-            compare<equal_to, T, 2>,
-            compare<equal_to, T, 3>,
-            compare<equal_to, T, 4>
+            cmpScalar<equal_to, T, 1>, // the table is indexed directly by cn; slot 0 above is a null entry
+            cmpScalar<equal_to, T, 2>,
+            cmpScalar<equal_to, T, 3>,
+            cmpScalar<equal_to, T, 4> // cn is not range-checked below: callers must pass 1..4
         };
 
         funcs[cn](src, val, dst, stream);
     }
-    template <typename T> void compare_ne(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) // forwards to cmpScalar<not_equal_to, T, cn>, chosen at run time from cn
     {
         typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
         static const func_t funcs[] =
         {
             0,
-            compare<not_equal_to, T, 1>,
-            compare<not_equal_to, T, 2>,
-            compare<not_equal_to, T, 3>,
-            compare<not_equal_to, T, 4>
+            cmpScalar<not_equal_to, T, 1>, // the table is indexed directly by cn; slot 0 above is a null entry
+            cmpScalar<not_equal_to, T, 2>,
+            cmpScalar<not_equal_to, T, 3>,
+            cmpScalar<not_equal_to, T, 4> // cn is not range-checked below: callers must pass 1..4
         };
 
         funcs[cn](src, val, dst, stream);
     }
-    template <typename T> void compare_lt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) // forwards to cmpScalar<less, T, cn>, chosen at run time from cn
     {
         typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
         static const func_t funcs[] =
         {
             0,
-            compare<less, T, 1>,
-            compare<less, T, 2>,
-            compare<less, T, 3>,
-            compare<less, T, 4>
+            cmpScalar<less, T, 1>, // the table is indexed directly by cn; slot 0 above is a null entry
+            cmpScalar<less, T, 2>,
+            cmpScalar<less, T, 3>,
+            cmpScalar<less, T, 4> // cn is not range-checked below: callers must pass 1..4
         };
 
         funcs[cn](src, val, dst, stream);
     }
-    template <typename T> void compare_le(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) // forwards to cmpScalar<less_equal, T, cn>, chosen at run time from cn
     {
         typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
         static const func_t funcs[] =
         {
             0,
-            compare<less_equal, T, 1>,
-            compare<less_equal, T, 2>,
-            compare<less_equal, T, 3>,
-            compare<less_equal, T, 4>
+            cmpScalar<less_equal, T, 1>, // the table is indexed directly by cn; slot 0 above is a null entry
+            cmpScalar<less_equal, T, 2>,
+            cmpScalar<less_equal, T, 3>,
+            cmpScalar<less_equal, T, 4> // cn is not range-checked below: callers must pass 1..4
         };
 
         funcs[cn](src, val, dst, stream);
     }
-    template <typename T> void compare_gt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) // forwards to cmpScalar<greater, T, cn>, chosen at run time from cn
     {
         typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
         static const func_t funcs[] =
         {
             0,
-            compare<greater, T, 1>,
-            compare<greater, T, 2>,
-            compare<greater, T, 3>,
-            compare<greater, T, 4>
+            cmpScalar<greater, T, 1>, // the table is indexed directly by cn; slot 0 above is a null entry
+            cmpScalar<greater, T, 2>,
+            cmpScalar<greater, T, 3>,
+            cmpScalar<greater, T, 4> // cn is not range-checked below: callers must pass 1..4
         };
 
         funcs[cn](src, val, dst, stream);
     }
-    template <typename T> void compare_ge(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream) // forwards to cmpScalar<greater_equal, T, cn>, chosen at run time from cn
     {
         typedef void (*func_t)(PtrStepSzb src, double val[4], PtrStepSzb dst, cudaStream_t stream);
         static const func_t funcs[] =
         {
             0,
-            compare<greater_equal, T, 1>,
-            compare<greater_equal, T, 2>,
-            compare<greater_equal, T, 3>,
-            compare<greater_equal, T, 4>
+            cmpScalar<greater_equal, T, 1>, // the table is indexed directly by cn; slot 0 above is a null entry
+            cmpScalar<greater_equal, T, 2>,
+            cmpScalar<greater_equal, T, 3>,
+            cmpScalar<greater_equal, T, 4> // cn is not range-checked below: callers must pass 1..4
         };
 
         funcs[cn](src, val, dst, stream);
     }
 
-    template void compare_eq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_eq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of cmpScalarEq, one per element type listed here
+    template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_ne<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ne<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of cmpScalarNe, one per element type listed here
+    template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_lt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_lt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of cmpScalarLt, one per element type listed here
+    template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_le<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_le<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of cmpScalarLe, one per element type listed here
+    template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_gt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_gt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_gt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_gt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_gt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_gt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_gt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of cmpScalarGt, one per element type listed here
+    template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 
-    template void compare_ge<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ge<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ge<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ge<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ge<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ge<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template void compare_ge<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream); // explicit instantiations of cmpScalarGe, one per element type listed here
+    template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<int   >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+}
 
-    //////////////////////////////////////////////////////////////////////////
-    // Unary bitwise logical matrix operations
+//////////////////////////////////////////////////////////////////////////////////////
+// bitMat: element-wise bitwise NOT / AND / OR / XOR on matrices, with an optional mask
 
-    enum { UN_OP_NOT };
-
-    template <typename T, int opid>
-    struct UnOp;
-
-    template <typename T>
-    struct UnOp<T, UN_OP_NOT>
+namespace cv { namespace gpu { namespace device // TransformFunctorTraits specializations for bit_not / bit_and / bit_or / bit_xor over uchar, ushort and uint
+{
+    template <> struct TransformFunctorTraits< bit_not<uchar> > : DefaultTransformFunctorTraits< bit_not<uchar> > // each specialization below declares only smart_shift
     {
-        static __device__ __forceinline__ T call(T v) { return ~v; }
+        enum { smart_shift = 4 }; // NOTE(review): presumably shadows a default inherited from DefaultTransformFunctorTraits — confirm in transform.hpp
+    };
+    template <> struct TransformFunctorTraits< bit_not<ushort> > : DefaultTransformFunctorTraits< bit_not<ushort> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< bit_not<uint> > : DefaultTransformFunctorTraits< bit_not<uint> >
+    {
+        enum { smart_shift = 2 }; // uint specializations use 2; uchar and ushort use 4
     };
 
-
-    template <int opid>
-    __global__ void bitwiseUnOpKernel(int rows, int width, const PtrStepb src, PtrStepb dst)
+    template <> struct TransformFunctorTraits< bit_and<uchar> > : DefaultTransformFunctorTraits< bit_and<uchar> >
     {
-        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< bit_and<ushort> > : DefaultTransformFunctorTraits< bit_and<ushort> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< bit_and<uint> > : DefaultTransformFunctorTraits< bit_and<uint> >
+    {
+        enum { smart_shift = 2 };
+    };
 
-        if (y < rows)
+    template <> struct TransformFunctorTraits< bit_or<uchar> > : DefaultTransformFunctorTraits< bit_or<uchar> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< bit_or<ushort> > : DefaultTransformFunctorTraits< bit_or<ushort> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< bit_or<uint> > : DefaultTransformFunctorTraits< bit_or<uint> >
+    {
+        enum { smart_shift = 2 };
+    };
+
+    template <> struct TransformFunctorTraits< bit_xor<uchar> > : DefaultTransformFunctorTraits< bit_xor<uchar> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< bit_xor<ushort> > : DefaultTransformFunctorTraits< bit_xor<ushort> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< bit_xor<uint> > : DefaultTransformFunctorTraits< bit_xor<uint> >
+    {
+        enum { smart_shift = 2 };
+    };
+}}}
+
+namespace arithm // host-side launchers for the matrix bitwise operations; each forwards to transform()
+{
+    template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) // applies bit_not<T> to src, writing dst
+    {
+        if (mask.data) // a non-null mask.data selects the masked transform() call
+            transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), mask, stream);
+        else
+            transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, bit_not<T>(), WithOutMask(), stream); // otherwise WithOutMask() is passed in the mask position
+    }
+
+    template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) // applies bit_and<T> to (src1, src2), writing dst
+    {
+        if (mask.data) // a non-null mask.data selects the masked transform() call
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), mask, stream);
+        else
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_and<T>(), WithOutMask(), stream);
+    }
+
+    template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) // applies bit_or<T> to (src1, src2), writing dst
+    {
+        if (mask.data) // a non-null mask.data selects the masked transform() call
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), mask, stream);
+        else
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_or<T>(), WithOutMask(), stream);
+    }
+
+    template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream) // applies bit_xor<T> to (src1, src2), writing dst
+    {
+        if (mask.data) // a non-null mask.data selects the masked transform() call
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), mask, stream);
+        else
+            transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, bit_xor<T>(), WithOutMask(), stream);
+    }
+
+    template void bitMatNot<uchar>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // explicit instantiations: every operation is emitted for uchar, ushort and uint
+    template void bitMatNot<ushort>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatNot<uint>(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void bitMatAnd<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatAnd<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatAnd<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void bitMatOr<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatOr<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatOr<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+
+    template void bitMatXor<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatXor<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template void bitMatXor<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////
+// bitScalar: element-wise bitwise AND / OR / XOR of a matrix with a scalar (no mask)
+
+namespace cv { namespace gpu { namespace device // TransformFunctorTraits specializations for binder2nd-wrapped bit_and / bit_or / bit_xor over uchar, ushort and uint
+{
+    template <> struct TransformFunctorTraits< binder2nd< bit_and<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<uchar> > > // each specialization below declares only smart_shift
+    {
+        enum { smart_shift = 4 }; // NOTE(review): presumably shadows a default inherited from DefaultTransformFunctorTraits — confirm in transform.hpp
+    };
+    template <> struct TransformFunctorTraits< binder2nd< bit_and<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<ushort> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< bit_and<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_and<uint> > >
+    {
+        enum { smart_shift = 2 }; // uint specializations use 2; uchar and ushort use 4
+    };
+
+    template <> struct TransformFunctorTraits< binder2nd< bit_or<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<uchar> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< bit_or<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<ushort> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< bit_or<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_or<uint> > >
+    {
+        enum { smart_shift = 2 };
+    };
+
+    template <> struct TransformFunctorTraits< binder2nd< bit_xor<uchar> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<uchar> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< bit_xor<ushort> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<ushort> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< bit_xor<uint> > > : DefaultTransformFunctorTraits< binder2nd< bit_xor<uint> > >
+    {
+        enum { smart_shift = 2 };
+    };
+}}}
+
+namespace arithm // host-side launchers for matrix-with-scalar bitwise operations; each forwards to transform()
+{
+    template <typename T> void bitScalarAnd(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream) // src2 is uint for every T and is handed to bind2nd as-is
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_and<T>(), src2), WithOutMask(), stream); // bit_and<T> with src2 bound as its second operand; WithOutMask() is always passed
+    }
+
+    template <typename T> void bitScalarOr(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream) // src2 is uint for every T and is handed to bind2nd as-is
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_or<T>(), src2), WithOutMask(), stream); // bit_or<T> with src2 bound as its second operand; WithOutMask() is always passed
+    }
+
+    template <typename T> void bitScalarXor(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream) // src2 is uint for every T and is handed to bind2nd as-is
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(bit_xor<T>(), src2), WithOutMask(), stream); // bit_xor<T> with src2 bound as its second operand; WithOutMask() is always passed
+    }
+
+    template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream); // explicit instantiations: every operation is emitted for uchar, ushort and uint
+    template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarAnd<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarOr<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+    template void bitScalarXor<uint>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// min
+
+namespace
+{
+    template <typename T> struct VMin4; // primary template is declared only; just the uint and int specializations are defined
+    template <> struct VMin4<uint> : binary_function<uint, uint, uint> // unsigned per-byte minimum of two packed 32-bit words (four 8-bit lanes)
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uchar* dst_ptr = dst.ptr(y) + x;
-            const uchar* src_ptr = src.ptr(y) + x;
-            if (x + sizeof(uint) - 1 < width)
-            {
-                *(uint*)dst_ptr = UnOp<uint, opid>::call(*(uint*)src_ptr);
-            }
-            else
-            {
-                const uchar* src_end = src.ptr(y) + width;
-                while (src_ptr < src_end)
-                {
-                    *dst_ptr++ = UnOp<uchar, opid>::call(*src_ptr++);
-                }
-            }
+            uint res = 0; // also serves as the merge operand (%3) of the asm statements below
+
+        #if __CUDA_ARCH__ >= 300 // sm_30 and newer: a single vmin4 instruction handles all four bytes
+            asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // sm_2x: one vmin per byte lane (b0..b3), each merging its result into res
+            asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res; // stays 0 when neither branch is compiled (__CUDA_ARCH__ below 200 or undefined)
         }
-    }
 
-
-    template <int opid>
-    void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst,
-                     cudaStream_t stream)
+        __device__ __forceinline__ VMin4() {}
+        __device__ __forceinline__ VMin4(const VMin4&) {} // parameter left unnamed: the functor has no state to copy
+    };
+    template <> struct VMin4<int> : binary_function<int, int, int> // signed per-byte minimum of two packed 32-bit words (four 8-bit lanes)
     {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(width, threads.x * sizeof(uint)),
-                  divUp(rows, threads.y));
-
-        bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-
-    template <typename T, int opid>
-    __global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src,
-                                      const PtrStepb mask, PtrStepb dst)
-    {
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < cols && y < rows && mask.ptr(y)[x / cn])
+        __device__ __forceinline__ int operator ()(int a, int b) const
         {
-            T* dst_row = (T*)dst.ptr(y);
-            const T* src_row = (const T*)src.ptr(y);
+            int res = 0; // also serves as the merge operand (%3) of the asm statements below
 
-            dst_row[x] = UnOp<T, opid>::call(src_row[x]);
+        #if __CUDA_ARCH__ >= 300 // sm_30 and newer: a single vmin4 instruction handles all four bytes
+            asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // sm_2x: one vmin per byte lane (b0..b3), each merging its result into res
+            asm("vmin.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res; // stays 0 when neither branch is compiled (__CUDA_ARCH__ below 200 or undefined)
         }
-    }
 
-
-    template <typename T, int opid>
-    void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src,
-                     const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
-    {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-        bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-
-    void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn,
-                          const PtrStepb src, PtrStepb dst, cudaStream_t stream)
-    {
-        bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);
-    }
-
-
-    template <typename T>
-    void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src,
-                              const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
-    {
-        bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);
-    }
-
-    template void bitwiseMaskNotCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskNotCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskNotCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-
-
-    //////////////////////////////////////////////////////////////////////////
-    // Binary bitwise logical matrix operations
-
-    enum { BIN_OP_OR, BIN_OP_AND, BIN_OP_XOR };
-
-    template <typename T, int opid>
-    struct BinOp;
-
-    template <typename T>
-    struct BinOp<T, BIN_OP_OR>
-    {
-        static __device__ __forceinline__ T call(T a, T b) { return a | b; }
+        __device__ __forceinline__ VMin4() {}
+        __device__ __forceinline__ VMin4(const VMin4&) {} // parameter left unnamed: the functor has no state to copy
     };
 
+    ////////////////////////////////////
 
-    template <typename T>
-    struct BinOp<T, BIN_OP_AND>
+    template <typename T> struct VMin2; // primary template is declared only; just the uint and int specializations are defined
+    template <> struct VMin2<uint> : binary_function<uint, uint, uint> // unsigned per-halfword minimum of two packed 32-bit words (two 16-bit lanes)
     {
-        static __device__ __forceinline__ T call(T a, T b) { return a & b; }
-    };
-
-    template <typename T>
-    struct BinOp<T, BIN_OP_XOR>
-    {
-        static __device__ __forceinline__ T call(T a, T b) { return a ^ b; }
-    };
-
-
-    template <int opid>
-    __global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1,
-                                       const PtrStepb src2, PtrStepb dst)
-    {
-        const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (y < rows)
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            uchar* dst_ptr = dst.ptr(y) + x;
-            const uchar* src1_ptr = src1.ptr(y) + x;
-            const uchar* src2_ptr = src2.ptr(y) + x;
+            uint res = 0; // also serves as the merge operand (%3) of the asm statements below
 
-            if (x + sizeof(uint) - 1 < width)
-            {
-                *(uint*)dst_ptr = BinOp<uint, opid>::call(*(uint*)src1_ptr, *(uint*)src2_ptr);
-            }
-            else
-            {
-                const uchar* src1_end = src1.ptr(y) + width;
-                while (src1_ptr < src1_end)
-                {
-                    *dst_ptr++ = BinOp<uchar, opid>::call(*src1_ptr++, *src2_ptr++);
-                }
-            }
+        #if __CUDA_ARCH__ >= 300 // sm_30 and newer: a single vmin2 instruction handles both halfwords
+            asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200 // sm_2x: one vmin per halfword lane (h0, h1), each merging its result into res
+            asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif
+
+            return res; // stays 0 when neither branch is compiled (__CUDA_ARCH__ below 200 or undefined)
         }
-    }
 
-
-    template <int opid>
-    void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2,
-                      PtrStepb dst, cudaStream_t stream)
+        __device__ __forceinline__ VMin2() {}
+        __device__ __forceinline__ VMin2(const VMin2&) {} // parameter left unnamed: the functor has no state to copy
+    };
+    template <> struct VMin2<int> : binary_function<int, int, int> // signed min of the two 16-bit halves packed in each word
     {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));
-
-        bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-
-    template <typename T, int opid>
-    __global__ void bitwiseBinOpKernel(
-            int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
-            const PtrStepb mask, PtrStepb dst)
-    {
-        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-        if (x < cols && y < rows && mask.ptr(y)[x / cn])
+        __device__ __forceinline__ int operator ()(int a, int b) const
         {
-            T* dst_row = (T*)dst.ptr(y);
-            const T* src1_row = (const T*)src1.ptr(y);
-            const T* src2_row = (const T*)src2.ptr(y);
+            int res = 0; // also passed to the asm below as its fourth operand (%3)
 
-            dst_row[x] = BinOp<T, opid>::call(src1_row[x], src2_row[x]);
+        #if __CUDA_ARCH__ >= 300
+            asm("vmin2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vmin.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmin.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif // below sm_20 no instruction is emitted and res stays 0
+
+            return res;
         }
+
+        __host__ __device__ __forceinline__ VMin2() {} // host-callable: arithm::vmin2 constructs VMin2<T>() in host code
+        __host__ __device__ __forceinline__ VMin2(const VMin2&) {} // stateless functor, nothing to copy
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <typename T> struct TransformFunctorTraits< VMin4<T> > : DefaultTransformFunctorTraits< VMin4<T> > // overrides two of the default traits for VMin4
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+
+    ////////////////////////////////////
+
+    template <typename T> struct TransformFunctorTraits< VMin2<T> > : DefaultTransformFunctorTraits< VMin2<T> > // overrides two of the default traits for VMin2
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+
+    ////////////////////////////////////
+
+    template <> struct TransformFunctorTraits< minimum<ushort> > : DefaultTransformFunctorTraits< minimum<ushort> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< minimum<short> > : DefaultTransformFunctorTraits< minimum<short> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< minimum<int> > : DefaultTransformFunctorTraits< minimum<int> >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< minimum<float> > : DefaultTransformFunctorTraits< minimum<float> >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct TransformFunctorTraits< binder2nd< minimum<ushort> > > : DefaultTransformFunctorTraits< binder2nd< minimum<ushort> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< minimum<short> > > : DefaultTransformFunctorTraits< binder2nd< minimum<short> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< minimum<int> > > : DefaultTransformFunctorTraits< binder2nd< minimum<int> > >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< minimum<float> > > : DefaultTransformFunctorTraits< binder2nd< minimum<float> > >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin4<T>(), WithOutMask(), stream);
     }
 
-
-    template <typename T, int opid>
-    void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
-                        const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
+    template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        dim3 threads(16, 16);
-        dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
-
-        bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMin2<T>(), WithOutMask(), stream);
     }
 
-
-    void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
-                         const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
+    template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, minimum<T>(), WithOutMask(), stream);
     }
 
+    template void vmin4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vmin4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template <typename T>
-    void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
-                             const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
+    template void vmin2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vmin2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minMat<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
     }
 
-    template void bitwiseMaskOrCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskOrCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
+    template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+}
 
+//////////////////////////////////////////////////////////////////////////
+// max
 
-    void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
-                          const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
+namespace
+{
+    template <typename T> struct VMax4;
+    template <> struct VMax4<uint> : binary_function<uint, uint, uint> // unsigned max of the four bytes packed in each word
     {
-        bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
-    }
-
-
-    template <typename T>
-    void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
-                              const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
-    {
-        bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
-    }
-
-    template void bitwiseMaskAndCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskAndCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-
-
-    void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
-                          const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
-    {
-        bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
-    }
-
-
-    template <typename T>
-    void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
-                              const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
-    {
-        bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
-    }
-
-    template void bitwiseMaskXorCaller<uchar>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-    template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-
-    //////////////////////////////////////////////////////////////////////////
-    // min/max
-
-    namespace detail
-    {
-        template <size_t size, typename F> struct MinMaxTraits : DefaultTransformFunctorTraits<F>
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-        };
-        template <typename F> struct MinMaxTraits<2, F> : DefaultTransformFunctorTraits<F>
+            uint res = 0; // also passed to the asm below as its fourth operand (%3)
+
+        #if __CUDA_ARCH__ >= 300
+            asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif // below sm_20 no instruction is emitted and res stays 0
+
+            return res;
+        }
+
+        __host__ __device__ __forceinline__ VMax4() {} // host-callable: arithm::vmax4 constructs VMax4<T>() in host code
+        __host__ __device__ __forceinline__ VMax4(const VMax4&) {} // stateless functor, nothing to copy
+    };
+    template <> struct VMax4<int> : binary_function<int, int, int> // signed max of the four bytes packed in each word
+    {
+        __device__ __forceinline__ int operator ()(int a, int b) const
         {
-            enum { smart_shift = 4 };
-        };
-        template <typename F> struct MinMaxTraits<4, F> : DefaultTransformFunctorTraits<F>
+            int res = 0; // also passed to the asm below as its fourth operand (%3)
+
+        #if __CUDA_ARCH__ >= 300
+            asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vmax.s32.s32.s32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif // below sm_20 no instruction is emitted and res stays 0
+
+            return res;
+        }
+
+        __host__ __device__ __forceinline__ VMax4() {} // host-callable: arithm::vmax4 constructs VMax4<T>() in host code
+        __host__ __device__ __forceinline__ VMax4(const VMax4&) {} // stateless functor, nothing to copy
+    };
+
+    ////////////////////////////////////
+
+    template <typename T> struct VMax2;
+    template <> struct VMax2<uint> : binary_function<uint, uint, uint> // unsigned max of the two 16-bit halves packed in each word
+    {
+        __device__ __forceinline__ uint operator ()(uint a, uint b) const
         {
-            enum { smart_block_dim_y = 4 };
-            enum { smart_shift = 4 };
-        };
-    }
+            uint res = 0; // also passed to the asm below as its fourth operand (%3)
 
-    template <typename T> struct TransformFunctorTraits< minimum<T> > : detail::MinMaxTraits< sizeof(T), minimum<T> >
-    {
+        #if __CUDA_ARCH__ >= 300
+            asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif // below sm_20 no instruction is emitted and res stays 0
+
+            return res;
+        }
+
+        __host__ __device__ __forceinline__ VMax2() {} // host-callable: arithm::vmax2 constructs VMax2<T>() in host code
+        __host__ __device__ __forceinline__ VMax2(const VMax2&) {} // stateless functor, nothing to copy
     };
-    template <typename T> struct TransformFunctorTraits< maximum<T> > : detail::MinMaxTraits< sizeof(T), maximum<T> >
+    template <> struct VMax2<int> : binary_function<int, int, int> // signed max of the two 16-bit halves packed in each word
     {
+        __device__ __forceinline__ int operator ()(int a, int b) const
+        {
+            int res = 0; // also passed to the asm below as its fourth operand (%3)
+
+        #if __CUDA_ARCH__ >= 300
+            asm("vmax2.s32.s32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #elif __CUDA_ARCH__ >= 200
+            asm("vmax.s32.s32.s32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+            asm("vmax.s32.s32.s32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(res));
+        #endif // below sm_20 no instruction is emitted and res stays 0
+
+            return res;
+        }
+
+        __host__ __device__ __forceinline__ VMax2() {} // host-callable: arithm::vmax2 constructs VMax2<T>() in host code
+        __host__ __device__ __forceinline__ VMax2(const VMax2&) {} // stateless functor, nothing to copy
     };
-    template <typename T> struct TransformFunctorTraits< binder2nd< minimum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< minimum<T> > >
-    {
-    };
-    template <typename T> struct TransformFunctorTraits< binder2nd< maximum<T> > > : detail::MinMaxTraits< sizeof(T), binder2nd< maximum<T> > >
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <typename T> struct TransformFunctorTraits< VMax4<T> > : DefaultTransformFunctorTraits< VMax4<T> >
     {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
     };
 
-    template <typename T>
-    void min_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    ////////////////////////////////////
+
+    template <typename T> struct TransformFunctorTraits< VMax2<T> > : DefaultTransformFunctorTraits< VMax2<T> >
     {
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, minimum<T>(), WithOutMask(), stream);
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+
+    ////////////////////////////////////
+
+    template <> struct TransformFunctorTraits< maximum<ushort> > : DefaultTransformFunctorTraits< maximum<ushort> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< maximum<short> > : DefaultTransformFunctorTraits< maximum<short> >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< maximum<int> > : DefaultTransformFunctorTraits< maximum<int> >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< maximum<float> > : DefaultTransformFunctorTraits< maximum<float> >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+
+    template <> struct TransformFunctorTraits< binder2nd< maximum<ushort> > > : DefaultTransformFunctorTraits< binder2nd< maximum<ushort> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< maximum<short> > > : DefaultTransformFunctorTraits< binder2nd< maximum<short> > >
+    {
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< maximum<int> > > : DefaultTransformFunctorTraits< binder2nd< maximum<int> > >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+    template <> struct TransformFunctorTraits< binder2nd< maximum<float> > > : DefaultTransformFunctorTraits< binder2nd< maximum<float> > >
+    {
+        enum { smart_block_dim_y = 4 };
+        enum { smart_shift = 4 };
+    };
+}}}
+
+namespace arithm
+{
+    template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    {
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax4<T>(), WithOutMask(), stream);
     }
 
-    template void min_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<int   >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T>
-    void max_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform((PtrStepSz<T>)src1, (PtrStepSz<T>)src2, (PtrStepSz<T>)dst, maximum<T>(), WithOutMask(), stream);
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, VMax2<T>(), WithOutMask(), stream);
     }
 
-    template void max_gpu<uchar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<schar >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<ushort>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<short >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<int   >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<float >(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<double>(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-
-    template <typename T>
-    void min_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, device::bind2nd(minimum<T>(), val), WithOutMask(), stream);
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) src2, (PtrStepSz<T>) dst, maximum<T>(), WithOutMask(), stream);
     }
 
-    template void min_gpu<uchar >(const PtrStepSzb src, uchar  val, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<schar >(const PtrStepSzb src, schar  val, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<ushort>(const PtrStepSzb src, ushort val, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<short >(const PtrStepSzb src, short  val, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<int   >(const PtrStepSzb src, int    val, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<float >(const PtrStepSzb src, float  val, PtrStepSzb dst, cudaStream_t stream);
-    template void min_gpu<double>(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void vmax4<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vmax4<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template <typename T>
-    void max_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream)
+    template void vmax2<uint>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void vmax2<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxMat<int   >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, device::bind2nd(maximum<T>(), val), WithOutMask(), stream);
+        transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
     }
 
-    template void max_gpu<uchar >(const PtrStepSzb src, uchar  val, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<schar >(const PtrStepSzb src, schar  val, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<ushort>(const PtrStepSzb src, ushort val, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<short >(const PtrStepSzb src, short  val, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<int   >(const PtrStepSzb src, int    val, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<float >(const PtrStepSzb src, float  val, PtrStepSzb dst, cudaStream_t stream);
-    template void max_gpu<double>(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxScalar<int   >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+}
 
-    //////////////////////////////////////////////////////////////////////////
-    // threshold
+//////////////////////////////////////////////////////////////////////////
+// threshold
 
+namespace cv { namespace gpu { namespace device
+{
     namespace detail
     {
         template <size_t size, typename F> struct ThresholdTraits : DefaultTransformFunctorTraits<F>
@@ -1874,19 +3021,21 @@ namespace cv { namespace gpu { namespace device
     template <typename T> struct TransformFunctorTraits< thresh_to_zero_inv_func<T> > : detail::ThresholdTraits< sizeof(T), thresh_to_zero_inv_func<T> >
     {
     };
+}}}
 
+namespace arithm
+{
     template <template <typename> class Op, typename T>
-    void threshold_caller(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, T thresh, T maxVal, cudaStream_t stream)
+    void threshold_caller(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream)
     {
         Op<T> op(thresh, maxVal);
-        cv::gpu::device::transform(src, dst, op, WithOutMask(), stream);
+        transform(src, dst, op, WithOutMask(), stream);
     }
 
     template <typename T>
-    void threshold_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, T thresh, T maxVal, int type,
-        cudaStream_t stream)
+    void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream)
     {
-        typedef void (*caller_t)(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, T thresh, T maxVal, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> dst, T thresh, T maxVal, cudaStream_t stream);
 
         static const caller_t callers[] =
         {
@@ -1897,23 +3046,26 @@ namespace cv { namespace gpu { namespace device
             threshold_caller<thresh_to_zero_inv_func, T>
         };
 
-        callers[type]((PtrStepSz<T>)src, (PtrStepSz<T>)dst, thresh, maxVal, stream);
+        callers[type]((PtrStepSz<T>) src, (PtrStepSz<T>) dst, static_cast<T>(thresh), static_cast<T>(maxVal), stream);
     }
 
-    template void threshold_gpu<uchar>(const PtrStepSzb& src, const PtrStepSzb& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);
-    template void threshold_gpu<schar>(const PtrStepSzb& src, const PtrStepSzb& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);
-    template void threshold_gpu<ushort>(const PtrStepSzb& src, const PtrStepSzb& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);
-    template void threshold_gpu<short>(const PtrStepSzb& src, const PtrStepSzb& dst, short thresh, short maxVal, int type, cudaStream_t stream);
-    template void threshold_gpu<int>(const PtrStepSzb& src, const PtrStepSzb& dst, int thresh, int maxVal, int type, cudaStream_t stream);
-    template void threshold_gpu<float>(const PtrStepSzb& src, const PtrStepSzb& dst, float thresh, float maxVal, int type, cudaStream_t stream);
-    template void threshold_gpu<double>(const PtrStepSzb& src, const PtrStepSzb& dst, double thresh, double maxVal, int type, cudaStream_t stream);
+    template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+    template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+    template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+    template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+    template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+    template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+    template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
+}
 
-    //////////////////////////////////////////////////////////////////////////
-    // pow
+//////////////////////////////////////////////////////////////////////////
+// pow
 
-    template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
+namespace
+{
+    template<typename T, bool Signed = numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
     {
-        const float power;
+        float power;
 
         PowOp(double power_) : power(static_cast<float>(power_)) {}
 
@@ -1924,7 +3076,7 @@ namespace cv { namespace gpu { namespace device
     };
     template<typename T> struct PowOp<T, true> : unary_function<T, T>
     {
-        const float power;
+        float power;
 
         PowOp(double power_) : power(static_cast<float>(power_)) {}
 
@@ -1951,7 +3103,7 @@ namespace cv { namespace gpu { namespace device
     };
     template<> struct PowOp<double> : unary_function<double, double>
     {
-        const double power;
+        double power;
 
         PowOp(double power_) : power(power_) {}
 
@@ -1960,7 +3112,10 @@ namespace cv { namespace gpu { namespace device
             return ::pow(::fabs(e), power);
         }
     };
+}
 
+namespace cv { namespace gpu { namespace device
+{
     namespace detail
     {
         template <size_t size, typename T> struct PowOpTraits : DefaultTransformFunctorTraits< PowOp<T> >
@@ -1985,83 +3140,78 @@ namespace cv { namespace gpu { namespace device
     template <typename T> struct TransformFunctorTraits< PowOp<T> > : detail::PowOpTraits<sizeof(T), T>
     {
     };
+}}}
 
+namespace arithm
+{
     template<typename T>
-    void pow_caller(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
+    void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream)
     {
-        cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, PowOp<T>(power), WithOutMask(), stream);
+        transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
     }
 
-    template void pow_caller<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-    template void pow_caller<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-    template void pow_caller<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-    template void pow_caller<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-    template void pow_caller<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-    template void pow_caller<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-    template void pow_caller<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+    template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
+}
 
-    //////////////////////////////////////////////////////////////////////////
-    // addWeighted
+//////////////////////////////////////////////////////////////////////////
+// addWeighted
 
-    namespace detail
+namespace
+{
+    template <typename T> struct UseDouble_
     {
-        template <typename T> struct UseDouble
-        {
-            enum {value = 0};
-        };
-        template <> struct UseDouble<int>
-        {
-            enum {value = 1};
-        };
-        template <> struct UseDouble<float>
-        {
-            enum {value = 1};
-        };
-        template <> struct UseDouble<double>
-        {
-            enum {value = 1};
-        };
-    }
+        enum {value = 0};
+    };
+    template <> struct UseDouble_<double>
+    {
+        enum {value = 1};
+    };
     template <typename T1, typename T2, typename D> struct UseDouble
     {
-        enum {value = (detail::UseDouble<T1>::value || detail::UseDouble<T2>::value || detail::UseDouble<D>::value)};
+        enum {value = (UseDouble_<T1>::value || UseDouble_<T2>::value || UseDouble_<D>::value)};
     };
 
-    namespace detail
+    template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted_;
+    template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, false> : binary_function<T1, T2, D>
     {
-        template <typename T1, typename T2, typename D, bool useDouble> struct AddWeighted;
-        template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, false> : binary_function<T1, T2, D>
+        float alpha;
+        float beta;
+        float gamma;
+
+        AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
+
+        __device__ __forceinline__ D operator ()(T1 a, T2 b) const
         {
-            AddWeighted(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
-
-            __device__ __forceinline__ D operator ()(T1 a, T2 b) const
-            {
-                return saturate_cast<D>(a * alpha + b * beta + gamma);
-            }
-
-            const float alpha;
-            const float beta;
-            const float gamma;
-        };
-        template <typename T1, typename T2, typename D> struct AddWeighted<T1, T2, D, true> : binary_function<T1, T2, D>
-        {
-            AddWeighted(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
-
-            __device__ __forceinline__ D operator ()(T1 a, T2 b) const
-            {
-                return saturate_cast<D>(a * alpha + b * beta + gamma);
-            }
-
-            const double alpha;
-            const double beta;
-            const double gamma;
-        };
-    }
-    template <typename T1, typename T2, typename D> struct AddWeighted : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>
-    {
-        AddWeighted(double alpha_, double beta_, double gamma_) : detail::AddWeighted<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
+            return saturate_cast<D>(a * alpha + b * beta + gamma);
+        }
     };
+    template <typename T1, typename T2, typename D> struct AddWeighted_<T1, T2, D, true> : binary_function<T1, T2, D>
+    {
+        double alpha;
+        double beta;
+        double gamma;
 
+        AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
+
+        __device__ __forceinline__ D operator ()(T1 a, T2 b) const
+        {
+            return saturate_cast<D>(a * alpha + b * beta + gamma);
+        }
+    };
+    template <typename T1, typename T2, typename D> struct AddWeighted : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>
+    {
+        AddWeighted(double alpha_, double beta_, double gamma_) : AddWeighted_<T1, T2, D, UseDouble<T1, T2, D>::value>(alpha_, beta_, gamma_) {}
+    };
+}
+
+namespace cv { namespace gpu { namespace device
+{
     template <> struct TransformFunctorTraits< AddWeighted<ushort, ushort, ushort> > : DefaultTransformFunctorTraits< AddWeighted<ushort, ushort, ushort> >
     {
         enum { smart_shift = 4 };
@@ -2117,257 +3267,253 @@ namespace cv { namespace gpu { namespace device
         enum { smart_block_dim_y = 8 };
         enum { smart_shift = 4 };
     };
+}}}
 
+namespace arithm
+{
     template <typename T1, typename T2, typename D>
-    void addWeighted_gpu(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream)
+    void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream)
     {
-        if (UseDouble<T1, T2, D>::value)
-        {
-            cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
-            cudaSafeCall( cudaSetDoubleForDevice(&beta) );
-            cudaSafeCall( cudaSetDoubleForDevice(&gamma) );
-        }
-
         AddWeighted<T1, T2, D> op(alpha, beta, gamma);
 
-        cv::gpu::device::transform(static_cast< PtrStepSz<T1> >(src1), static_cast< PtrStepSz<T2> >(src2), static_cast< PtrStepSz<D> >(dst), op, WithOutMask(), stream);
+        transform((PtrStepSz<T1>) src1, (PtrStepSz<T2>) src2, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
     }
 
-    template void addWeighted_gpu<uchar, uchar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, uchar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, uchar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, uchar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, uchar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, uchar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, uchar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, uchar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, uchar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, uchar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<uchar, schar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, schar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, schar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, schar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, schar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, schar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, schar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<uchar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<uchar, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<uchar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<uchar, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<uchar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<uchar, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<uchar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<uchar, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<uchar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<uchar, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<uchar, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<uchar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
 
 
-    template void addWeighted_gpu<schar, schar, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, schar, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, schar, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, schar, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, schar, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, schar, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, schar, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, schar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, schar, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, schar, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, schar, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<schar, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<schar, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<schar, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<schar, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<schar, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<schar, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<schar, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<schar, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<schar, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<schar, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<schar, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
 
 
-    template void addWeighted_gpu<ushort, ushort, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, ushort, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, ushort, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, ushort, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, ushort, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, ushort, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, ushort, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, ushort, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, ushort, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, ushort, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, ushort, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<ushort, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<ushort, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<ushort, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<ushort, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<ushort, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<ushort, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<ushort, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<ushort, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<ushort, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
 
 
-    template void addWeighted_gpu<short, short, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, short, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, short, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, short, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, short, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, short, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, short, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, short, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, short, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, short, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, short, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<short, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<short, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<short, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<short, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<short, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<short, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<short, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
 
 
-    template void addWeighted_gpu<int, int, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, int, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, int, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, int, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, int, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, int, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, int, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, int, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, int, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, int, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, int, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<int, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<int, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<int, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<int, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<int, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
 
 
-    template void addWeighted_gpu<float, float, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, float, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, float, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, float, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, float, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, float, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, float, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
-    template void addWeighted_gpu<float, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<float, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
+    template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
 
 
 
-    template void addWeighted_gpu<double, double, uchar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<double, double, schar>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<double, double, ushort>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<double, double, short>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<double, double, int>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<double, double, float>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-    template void addWeighted_gpu<double, double, double>(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-}}} // namespace cv { namespace gpu { namespace device
+    template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<double, double, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+    template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
+}
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index 2a22b2ffb..1943b315d 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -99,58 +99,6 @@ namespace
     template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
     template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
 
-    template <int DEPTH> struct NppArithmFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, const npp_t* pSrc2, int nSrc2Step, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
-    };
-    template <> struct NppArithmFunc<CV_32F>
-    {
-        typedef NppTypeTraits<CV_32F>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const Npp32f* pSrc1, int nSrc1Step, const Npp32f* pSrc2, int nSrc2Step, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
-    };
-
-    template <int DEPTH, typename NppArithmFunc<DEPTH>::func_t func> struct NppArithm
-    {
-        typedef typename NppArithmFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize sz;
-            sz.width  = src1.cols;
-            sz.height = src1.rows;
-
-            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
-                              (npp_t*)dst.data, static_cast<int>(dst.step), sz, 0) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template <typename NppArithmFunc<CV_32F>::func_t func> struct NppArithm<CV_32F, func>
-    {
-        typedef typename NppArithmFunc<CV_32F>::npp_t npp_t;
-
-        static void call(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize sz;
-            sz.width  = src1.cols;
-            sz.height = src1.rows;
-
-            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
-                              (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
     template<int DEPTH, int cn> struct NppArithmScalarFunc
     {
         typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
@@ -313,87 +261,294 @@ namespace
 ////////////////////////////////////////////////////////////////////////
 // add
 
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations of the implementations dispatched by cv::gpu::add(GpuMat, GpuMat)
 {
     template <typename T, typename D>
-    void add_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    void vadd4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // cv::gpu::add passes views whose column count is divided by 4
 
     template <typename T, typename D>
-    void add_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-}}}
+    void vadd2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // cv::gpu::add passes views whose column count is divided by 2
+
+    template <typename T, typename D>
+    void addMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // generic fallback; the only one of the three that takes a mask
+}
 
 void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm; // brings vadd4 / vadd2 / addMat into scope for the tables below
 
-    typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // signature of the generic addMat fallback; table below is indexed [sdepth][ddepth], 0 = unsupported pair
     static const func_t funcs[7][7] =
     {
-        {add_gpu<unsigned char, unsigned char>       , 0 /*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short>    , add_gpu<unsigned char, short>       , add_gpu<unsigned char, int>    , add_gpu<unsigned char, float>    , add_gpu<unsigned char, double>    },
-        {0 /*add_gpu<signed char, unsigned char>*/   , 0 /*add_gpu<signed char, signed char>*/   , 0 /*add_gpu<signed char, unsigned short>*/, 0 /*add_gpu<signed char, short>*/   , 0 /*add_gpu<signed char, int>*/, 0 /*add_gpu<signed char, float>*/, 0 /*add_gpu<signed char, double>*/},
-        {0 /*add_gpu<unsigned short, unsigned char>*/, 0 /*add_gpu<unsigned short, signed char>*/, add_gpu<unsigned short, unsigned short>   , 0 /*add_gpu<unsigned short, short>*/, add_gpu<unsigned short, int>   , add_gpu<unsigned short, float>   , add_gpu<unsigned short, double>   },
-        {0 /*add_gpu<short, unsigned char>*/         , 0 /*add_gpu<short, signed char>*/         , 0 /*add_gpu<short, unsigned short>*/      , add_gpu<short, short>               , add_gpu<short, int>            , add_gpu<short, float>            , add_gpu<short, double>            },
-        {0 /*add_gpu<int, unsigned char>*/           , 0 /*add_gpu<int, signed char>*/           , 0 /*add_gpu<int, unsigned short>*/        , 0 /*add_gpu<int, short>*/           , add_gpu<int, int>              , add_gpu<int, float>              , add_gpu<int, double>              },
-        {0 /*add_gpu<float, unsigned char>*/         , 0 /*add_gpu<float, signed char>*/         , 0 /*add_gpu<float, unsigned short>*/      , 0 /*add_gpu<float, short>*/         , 0 /*add_gpu<float, int>*/      , add_gpu<float, float>            , add_gpu<float, double>            },
-        {0 /*add_gpu<double, unsigned char>*/        , 0 /*add_gpu<double, signed char>*/        , 0 /*add_gpu<double, unsigned short>*/     , 0 /*add_gpu<double, short>*/        , 0 /*add_gpu<double, int>*/     , 0 /*add_gpu<double, float>*/     , add_gpu<double, double>           }
+        { // source depth 0: unsigned char
+            addMat<unsigned char, unsigned char>,
+            addMat<unsigned char, signed char>,
+            addMat<unsigned char, unsigned short>,
+            addMat<unsigned char, short>,
+            addMat<unsigned char, int>,
+            addMat<unsigned char, float>,
+            addMat<unsigned char, double>
+        },
+        { // source depth 1: signed char
+            addMat<signed char, unsigned char>,
+            addMat<signed char, signed char>,
+            addMat<signed char, unsigned short>,
+            addMat<signed char, short>,
+            addMat<signed char, int>,
+            addMat<signed char, float>,
+            addMat<signed char, double>
+        },
+        { // source depth 2: unsigned short
+            0 /*addMat<unsigned short, unsigned char>*/,
+            0 /*addMat<unsigned short, signed char>*/,
+            addMat<unsigned short, unsigned short>,
+            addMat<unsigned short, short>,
+            addMat<unsigned short, int>,
+            addMat<unsigned short, float>,
+            addMat<unsigned short, double>
+        },
+        { // source depth 3: short
+            0 /*addMat<short, unsigned char>*/,
+            0 /*addMat<short, signed char>*/,
+            addMat<short, unsigned short>,
+            addMat<short, short>,
+            addMat<short, int>,
+            addMat<short, float>,
+            addMat<short, double>
+        },
+        { // source depth 4: int
+            0 /*addMat<int, unsigned char>*/,
+            0 /*addMat<int, signed char>*/,
+            0 /*addMat<int, unsigned short>*/,
+            0 /*addMat<int, short>*/,
+            addMat<int, int>,
+            addMat<int, float>,
+            addMat<int, double>
+        },
+        { // source depth 5: float
+            0 /*addMat<float, unsigned char>*/,
+            0 /*addMat<float, signed char>*/,
+            0 /*addMat<float, unsigned short>*/,
+            0 /*addMat<float, short>*/,
+            0 /*addMat<float, int>*/,
+            addMat<float, float>,
+            addMat<float, double>
+        },
+        { // source depth 6: double
+            0 /*addMat<double, unsigned char>*/,
+            0 /*addMat<double, signed char>*/,
+            0 /*addMat<double, unsigned short>*/,
+            0 /*addMat<double, short>*/,
+            0 /*addMat<double, int>*/,
+            0 /*addMat<double, float>*/,
+            addMat<double, double>
+        }
     };
 
-    typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
-    static const npp_func_t npp_funcs[] =
+    typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // mask-less signature shared by vadd4 and vadd2
+    static const vfunc_t vfuncs4[4][4] = // indexed [sdepth][ddepth]; only the 8-bit pairs (depths 0 and 1) are populated
     {
-        NppArithm<CV_8U , nppiAdd_8u_C1RSfs >::call,
-        0,
-        NppArithm<CV_16U, nppiAdd_16u_C1RSfs>::call,
-        NppArithm<CV_16S, nppiAdd_16s_C1RSfs>::call,
-        NppArithm<CV_32S, nppiAdd_32s_C1RSfs>::call,
-        NppArithm<CV_32F, nppiAdd_32f_C1R   >::call
+        {
+            vadd4<unsigned int, unsigned int>,
+            vadd4<unsigned int, int>,
+            0,
+            0
+        },
+        {
+            vadd4<int, unsigned int>,
+            vadd4<int, int>,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            0,
+            0
+        }
+    };
+    static const vfunc_t vfuncs2[4][4] = // indexed [sdepth][ddepth]; only the 16-bit pairs (depths 2 and 3) are populated
+    {
+        {
+            0,
+            0,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            vadd2<unsigned int, unsigned int>,
+            vadd2<unsigned int, int>
+        },
+        {
+            0,
+            0,
+            vadd2<int, unsigned int>,
+            vadd2<int, int>
+        }
     };
 
     if (dtype < 0)
         dtype = src1.depth();
 
-    CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-    CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
-    CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));
+    const int sdepth = src1.depth();
+    const int ddepth = CV_MAT_DEPTH(dtype); // a negative dtype was replaced by the source depth above
+    const int cn = src1.channels();
 
-    if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) ); // a mask is accepted only for single-channel input
+
+    if (sdepth == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+    dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    if (mask.empty() && dst.type() == src1.type() && src1.depth() <= CV_32F)
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // flattened view: each row is cols * cn scalar elements
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); // src1's dimensions are reused; sizes were asserted equal above
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); // dst was created with src1.size() above
+
+    if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S) // packed path is tried only without a mask and with both depths below CV_32S
     {
-        npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-        return;
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // all three base pointers are multiples of 32 bytes
+
+        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned) // NOTE(review): only base pointers are checked; later rows also depend on step - confirm step keeps them aligned
+        {
+            const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth]; // in range: both depths are below CV_32S here
+            const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
+
+            if (vfunc4 != 0 && (src1_.cols & 3) == 0) // flattened row length must be a multiple of 4
+            {
+                const int vcols = src1_.cols >> 2; // column count handed to vadd4 is a quarter of the flattened width
+
+                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+
+            if (vfunc2 != 0 && (src1_.cols & 1) == 0) // flattened row length must be even
+            {
+                const int vcols = src1_.cols >> 1; // column count handed to vadd2 is half of the flattened width
+
+                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+        }
     }
 
-    const func_t func = funcs[src1.depth()][dst.depth()];
+    const func_t func = funcs[sdepth][ddepth]; // reached whenever neither packed variant returned above
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src1.reshape(1), src2.reshape(1), dst.reshape(1), mask, stream);
+    func(src1_, src2_, dst_, mask, stream);
+}
+
+namespace arithm // forward declaration of the implementation dispatched by cv::gpu::add(GpuMat, Scalar)
+{
+    template <typename T, typename S, typename D> // the dispatch table in cv::gpu::add instantiates S as float, or as double whenever D is double
+    void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // NOTE(review): S is presumably the intermediate computation type - confirm against the definition
 }
 
 void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
-        {add_gpu<unsigned char, unsigned char>       , 0 /*add_gpu<unsigned char, signed char>*/ , add_gpu<unsigned char, unsigned short>    , add_gpu<unsigned char, short>       , add_gpu<unsigned char, int>    , add_gpu<unsigned char, float>    , add_gpu<unsigned char, double>    },
-        {0 /*add_gpu<signed char, unsigned char>*/   , 0 /*add_gpu<signed char, signed char>*/   , 0 /*add_gpu<signed char, unsigned short>*/, 0 /*add_gpu<signed char, short>*/   , 0 /*add_gpu<signed char, int>*/, 0 /*add_gpu<signed char, float>*/, 0 /*add_gpu<signed char, double>*/},
-        {0 /*add_gpu<unsigned short, unsigned char>*/, 0 /*add_gpu<unsigned short, signed char>*/, add_gpu<unsigned short, unsigned short>   , 0 /*add_gpu<unsigned short, short>*/, add_gpu<unsigned short, int>   , add_gpu<unsigned short, float>   , add_gpu<unsigned short, double>   },
-        {0 /*add_gpu<short, unsigned char>*/         , 0 /*add_gpu<short, signed char>*/         , 0 /*add_gpu<short, unsigned short>*/      , add_gpu<short, short>               , add_gpu<short, int>            , add_gpu<short, float>            , add_gpu<short, double>            },
-        {0 /*add_gpu<int, unsigned char>*/           , 0 /*add_gpu<int, signed char>*/           , 0 /*add_gpu<int, unsigned short>*/        , 0 /*add_gpu<int, short>*/           , add_gpu<int, int>              , add_gpu<int, float>              , add_gpu<int, double>              },
-        {0 /*add_gpu<float, unsigned char>*/         , 0 /*add_gpu<float, signed char>*/         , 0 /*add_gpu<float, unsigned short>*/      , 0 /*add_gpu<float, short>*/         , 0 /*add_gpu<float, int>*/      , add_gpu<float, float>            , add_gpu<float, double>            },
-        {0 /*add_gpu<double, unsigned char>*/        , 0 /*add_gpu<double, signed char>*/        , 0 /*add_gpu<double, unsigned short>*/     , 0 /*add_gpu<double, short>*/        , 0 /*add_gpu<double, int>*/     , 0 /*add_gpu<double, float>*/     , add_gpu<double, double>           }
+        {
+            addScalar<unsigned char, float, unsigned char>,
+            addScalar<unsigned char, float, signed char>,
+            addScalar<unsigned char, float, unsigned short>,
+            addScalar<unsigned char, float, short>,
+            addScalar<unsigned char, float, int>,
+            addScalar<unsigned char, float, float>,
+            addScalar<unsigned char, double, double>
+        },
+        {
+            addScalar<signed char, float, unsigned char>,
+            addScalar<signed char, float, signed char>,
+            addScalar<signed char, float, unsigned short>,
+            addScalar<signed char, float, short>,
+            addScalar<signed char, float, int>,
+            addScalar<signed char, float, float>,
+            addScalar<signed char, double, double>
+        },
+        {
+            0 /*addScalar<unsigned short, float, unsigned char>*/,
+            0 /*addScalar<unsigned short, float, signed char>*/,
+            addScalar<unsigned short, float, unsigned short>,
+            addScalar<unsigned short, float, short>,
+            addScalar<unsigned short, float, int>,
+            addScalar<unsigned short, float, float>,
+            addScalar<unsigned short, double, double>
+        },
+        {
+            0 /*addScalar<short, float, unsigned char>*/,
+            0 /*addScalar<short, float, signed char>*/,
+            addScalar<short, float, unsigned short>,
+            addScalar<short, float, short>,
+            addScalar<short, float, int>,
+            addScalar<short, float, float>,
+            addScalar<short, double, double>
+        },
+        {
+            0 /*addScalar<int, float, unsigned char>*/,
+            0 /*addScalar<int, float, signed char>*/,
+            0 /*addScalar<int, float, unsigned short>*/,
+            0 /*addScalar<int, float, short>*/,
+            addScalar<int, float, int>,
+            addScalar<int, float, float>,
+            addScalar<int, double, double>
+        },
+        {
+            0 /*addScalar<float, float, unsigned char>*/,
+            0 /*addScalar<float, float, signed char>*/,
+            0 /*addScalar<float, float, unsigned short>*/,
+            0 /*addScalar<float, float, short>*/,
+            0 /*addScalar<float, float, int>*/,
+            addScalar<float, float, float>,
+            addScalar<float, double, double>
+        },
+        {
+            0 /*addScalar<double, double, unsigned char>*/,
+            0 /*addScalar<double, double, signed char>*/,
+            0 /*addScalar<double, double, unsigned short>*/,
+            0 /*addScalar<double, double, short>*/,
+            0 /*addScalar<double, double, int>*/,
+            0 /*addScalar<double, double, float>*/,
+            addScalar<double, double, double>
+        }
     };
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
@@ -411,34 +566,34 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
     if (dtype < 0)
         dtype = src.depth();
 
-    CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-    CV_Assert(src.channels() <= 4);
-    CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U));
+    const int sdepth = src.depth();
+    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int cn = src.channels();
 
-    if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( cn <= 4 );
+    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) );
+
+    if (sdepth == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    if (mask.empty() && dst.type() == src.type())
+    const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
+    if (ddepth == sdepth && cn > 1 && npp_func != 0)
     {
-        const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
-
-        if (npp_func)
-        {
-            npp_func(src, sc, dst, stream);
-            return;
-        }
+        npp_func(src, sc, dst, stream);
+        return;
     }
 
-    CV_Assert(src.channels() == 1);
+    CV_Assert( cn == 1 );
 
-    const func_t func = funcs[src.depth()][dst.depth()];
+    const func_t func = funcs[sdepth][ddepth];
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
@@ -449,87 +604,294 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
 ////////////////////////////////////////////////////////////////////////
 // subtract
 
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations only; the definitions are not part of this hunk
 {
     template <typename T, typename D>
-    void subtract_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    void vsub4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // maskless; cv::gpu::subtract below calls it with the column count divided by 4
 
     template <typename T, typename D>
-    void subtract_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
-}}}
+    void vsub2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // maskless; cv::gpu::subtract below calls it with the column count divided by 2
+
+    template <typename T, typename D>
+    void subMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // general path; the only one of the three that takes a mask
+}
 
 void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // signature of the general path, which accepts a mask
     static const func_t funcs[7][7] =
     {
-        {subtract_gpu<unsigned char, unsigned char>       , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short>    , subtract_gpu<unsigned char, short>       , subtract_gpu<unsigned char, int>    , subtract_gpu<unsigned char, float>    , subtract_gpu<unsigned char, double>    },
-        {0 /*subtract_gpu<signed char, unsigned char>*/   , 0 /*subtract_gpu<signed char, signed char>*/   , 0 /*subtract_gpu<signed char, unsigned short>*/, 0 /*subtract_gpu<signed char, short>*/   , 0 /*subtract_gpu<signed char, int>*/, 0 /*subtract_gpu<signed char, float>*/, 0 /*subtract_gpu<signed char, double>*/},
-        {0 /*subtract_gpu<unsigned short, unsigned char>*/, 0 /*subtract_gpu<unsigned short, signed char>*/, subtract_gpu<unsigned short, unsigned short>   , 0 /*subtract_gpu<unsigned short, short>*/, subtract_gpu<unsigned short, int>   , subtract_gpu<unsigned short, float>   , subtract_gpu<unsigned short, double>   },
-        {0 /*subtract_gpu<short, unsigned char>*/         , 0 /*subtract_gpu<short, signed char>*/         , 0 /*subtract_gpu<short, unsigned short>*/      , subtract_gpu<short, short>               , subtract_gpu<short, int>            , subtract_gpu<short, float>            , subtract_gpu<short, double>            },
-        {0 /*subtract_gpu<int, unsigned char>*/           , 0 /*subtract_gpu<int, signed char>*/           , 0 /*subtract_gpu<int, unsigned short>*/        , 0 /*subtract_gpu<int, short>*/           , subtract_gpu<int, int>              , subtract_gpu<int, float>              , subtract_gpu<int, double>              },
-        {0 /*subtract_gpu<float, unsigned char>*/         , 0 /*subtract_gpu<float, signed char>*/         , 0 /*subtract_gpu<float, unsigned short>*/      , 0 /*subtract_gpu<float, short>*/         , 0 /*subtract_gpu<float, int>*/      , subtract_gpu<float, float>            , subtract_gpu<float, double>            },
-        {0 /*subtract_gpu<double, unsigned char>*/        , 0 /*subtract_gpu<double, signed char>*/        , 0 /*subtract_gpu<double, unsigned short>*/     , 0 /*subtract_gpu<double, short>*/        , 0 /*subtract_gpu<double, int>*/     , 0 /*subtract_gpu<double, float>*/     , subtract_gpu<double, double>           }
+        { // row = source depth, column = destination depth (looked up as funcs[sdepth][ddepth] below); a 0 entry is rejected as unsupported
+            subMat<unsigned char, unsigned char>,
+            subMat<unsigned char, signed char>,
+            subMat<unsigned char, unsigned short>,
+            subMat<unsigned char, short>,
+            subMat<unsigned char, int>,
+            subMat<unsigned char, float>,
+            subMat<unsigned char, double>
+        },
+        {
+            subMat<signed char, unsigned char>,
+            subMat<signed char, signed char>,
+            subMat<signed char, unsigned short>,
+            subMat<signed char, short>,
+            subMat<signed char, int>,
+            subMat<signed char, float>,
+            subMat<signed char, double>
+        },
+        {
+            0 /*subMat<unsigned short, unsigned char>*/,
+            0 /*subMat<unsigned short, signed char>*/,
+            subMat<unsigned short, unsigned short>,
+            subMat<unsigned short, short>,
+            subMat<unsigned short, int>,
+            subMat<unsigned short, float>,
+            subMat<unsigned short, double>
+        },
+        {
+            0 /*subMat<short, unsigned char>*/,
+            0 /*subMat<short, signed char>*/,
+            subMat<short, unsigned short>,
+            subMat<short, short>,
+            subMat<short, int>,
+            subMat<short, float>,
+            subMat<short, double>
+        },
+        {
+            0 /*subMat<int, unsigned char>*/,
+            0 /*subMat<int, signed char>*/,
+            0 /*subMat<int, unsigned short>*/,
+            0 /*subMat<int, short>*/,
+            subMat<int, int>,
+            subMat<int, float>,
+            subMat<int, double>
+        },
+        {
+            0 /*subMat<float, unsigned char>*/,
+            0 /*subMat<float, signed char>*/,
+            0 /*subMat<float, unsigned short>*/,
+            0 /*subMat<float, short>*/,
+            0 /*subMat<float, int>*/,
+            subMat<float, float>,
+            subMat<float, double>
+        },
+        {
+            0 /*subMat<double, unsigned char>*/,
+            0 /*subMat<double, signed char>*/,
+            0 /*subMat<double, unsigned short>*/,
+            0 /*subMat<double, short>*/,
+            0 /*subMat<double, int>*/,
+            0 /*subMat<double, float>*/,
+            subMat<double, double>
+        }
     };
 
-    typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
-    static const npp_func_t npp_funcs[6] =
+    typedef void (*vfunc_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream); // signature of the packed paths: no mask parameter
+    static const vfunc_t vfuncs4[4][4] = // looked up as [sdepth][ddepth]; only depth indices 0 and 1 are populated
     {
-        NppArithm<CV_8U , nppiSub_8u_C1RSfs>::call,
-        0,
-        NppArithm<CV_16U, nppiSub_16u_C1RSfs>::call,
-        NppArithm<CV_16S, nppiSub_16s_C1RSfs>::call,
-        NppArithm<CV_32S, nppiSub_32s_C1RSfs>::call,
-        NppArithm<CV_32F, nppiSub_32f_C1R   >::call
+        {
+            vsub4<unsigned int, unsigned int>,
+            vsub4<unsigned int, int>,
+            0,
+            0
+        },
+        {
+            vsub4<int, unsigned int>,
+            vsub4<int, int>,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            0,
+            0
+        }
+    };
+    static const vfunc_t vfuncs2[4][4] = // looked up as [sdepth][ddepth]; only depth indices 2 and 3 are populated
+    {
+        {
+            0,
+            0,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            0,
+            0
+        },
+        {
+            0,
+            0,
+            vsub2<unsigned int, unsigned int>,
+            vsub2<unsigned int, int>
+        },
+        {
+            0,
+            0,
+            vsub2<int, unsigned int>,
+            vsub2<int, int>
+        }
     };
 
     if (dtype < 0)
         dtype = src1.depth();
 
-    CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-    CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
-    CV_Assert(mask.empty() || (src1.channels() == 1 && mask.size() == src1.size() && mask.type() == CV_8U));
+    const int sdepth = src1.depth();
+    const int ddepth = CV_MAT_DEPTH(dtype); // only the depth part of dtype is used; the channel count always comes from src1
+    const int cn = src1.channels();
 
-    if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src1.size() && mask.type() == CV_8U) );
+
+    if (sdepth == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+    dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    if (mask.empty() && dst.type() == src1.type() && src1.depth() <= CV_32F)
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // channels are folded into the column count (cols * cn)
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); // src1's dimensions are reused; the assert above guarantees src2 matches
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+    if (mask.empty() && sdepth < CV_32S && ddepth < CV_32S) // packed fast path: only without a mask and with both depths below CV_32S
     {
-        npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), stream);
-        return;
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // every base address must be a multiple of 32
+
+        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        {
+            const vfunc_t vfunc4 = vfuncs4[sdepth][ddepth];
+            const vfunc_t vfunc2 = vfuncs2[sdepth][ddepth];
+
+            if (vfunc4 != 0 && (src1_.cols & 3) == 0) // needs a table entry and a column count divisible by 4
+            {
+                const int vcols = src1_.cols >> 2; // width handed to vsub4 is the column count divided by 4
+
+                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+
+            if (vfunc2 != 0 && (src1_.cols & 1) == 0) // needs a table entry and an even column count
+            {
+                const int vcols = src1_.cols >> 1; // width handed to vsub2 is the column count divided by 2
+
+                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+        }
     }
 
-    const func_t func = funcs[src1.depth()][dst.depth()];
+    const func_t func = funcs[sdepth][ddepth]; // reached whenever no packed path returned above
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src1.reshape(1), src2.reshape(1), dst.reshape(1), mask, stream);
+    func(src1_, src2_, dst_, mask, stream);
+}
+
+namespace arithm // forward declaration only; the definition is not part of this hunk
+{
+    template <typename T, typename S, typename D>
+    void subScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream); // in the table below S is float, or double when D (or T) is double
 }
 
 void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, const PtrStepb& mask, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
-        {subtract_gpu<unsigned char, unsigned char>       , 0 /*subtract_gpu<unsigned char, signed char>*/ , subtract_gpu<unsigned char, unsigned short>    , subtract_gpu<unsigned char, short>       , subtract_gpu<unsigned char, int>    , subtract_gpu<unsigned char, float>    , subtract_gpu<unsigned char, double>    },
-        {0 /*subtract_gpu<signed char, unsigned char>*/   , 0 /*subtract_gpu<signed char, signed char>*/   , 0 /*subtract_gpu<signed char, unsigned short>*/, 0 /*subtract_gpu<signed char, short>*/   , 0 /*subtract_gpu<signed char, int>*/, 0 /*subtract_gpu<signed char, float>*/, 0 /*subtract_gpu<signed char, double>*/},
-        {0 /*subtract_gpu<unsigned short, unsigned char>*/, 0 /*subtract_gpu<unsigned short, signed char>*/, subtract_gpu<unsigned short, unsigned short>   , 0 /*subtract_gpu<unsigned short, short>*/, subtract_gpu<unsigned short, int>   , subtract_gpu<unsigned short, float>   , subtract_gpu<unsigned short, double>   },
-        {0 /*subtract_gpu<short, unsigned char>*/         , 0 /*subtract_gpu<short, signed char>*/         , 0 /*subtract_gpu<short, unsigned short>*/      , subtract_gpu<short, short>               , subtract_gpu<short, int>            , subtract_gpu<short, float>            , subtract_gpu<short, double>            },
-        {0 /*subtract_gpu<int, unsigned char>*/           , 0 /*subtract_gpu<int, signed char>*/           , 0 /*subtract_gpu<int, unsigned short>*/        , 0 /*subtract_gpu<int, short>*/           , subtract_gpu<int, int>              , subtract_gpu<int, float>              , subtract_gpu<int, double>              },
-        {0 /*subtract_gpu<float, unsigned char>*/         , 0 /*subtract_gpu<float, signed char>*/         , 0 /*subtract_gpu<float, unsigned short>*/      , 0 /*subtract_gpu<float, short>*/         , 0 /*subtract_gpu<float, int>*/      , subtract_gpu<float, float>            , subtract_gpu<float, double>            },
-        {0 /*subtract_gpu<double, unsigned char>*/        , 0 /*subtract_gpu<double, signed char>*/        , 0 /*subtract_gpu<double, unsigned short>*/     , 0 /*subtract_gpu<double, short>*/        , 0 /*subtract_gpu<double, int>*/     , 0 /*subtract_gpu<double, float>*/     , subtract_gpu<double, double>           }
+        {
+            subScalar<unsigned char, float, unsigned char>,
+            subScalar<unsigned char, float, signed char>,
+            subScalar<unsigned char, float, unsigned short>,
+            subScalar<unsigned char, float, short>,
+            subScalar<unsigned char, float, int>,
+            subScalar<unsigned char, float, float>,
+            subScalar<unsigned char, double, double>
+        },
+        {
+            subScalar<signed char, float, unsigned char>,
+            subScalar<signed char, float, signed char>,
+            subScalar<signed char, float, unsigned short>,
+            subScalar<signed char, float, short>,
+            subScalar<signed char, float, int>,
+            subScalar<signed char, float, float>,
+            subScalar<signed char, double, double>
+        },
+        {
+            0 /*subScalar<unsigned short, float, unsigned char>*/,
+            0 /*subScalar<unsigned short, float, signed char>*/,
+            subScalar<unsigned short, float, unsigned short>,
+            subScalar<unsigned short, float, short>,
+            subScalar<unsigned short, float, int>,
+            subScalar<unsigned short, float, float>,
+            subScalar<unsigned short, double, double>
+        },
+        {
+            0 /*subScalar<short, float, unsigned char>*/,
+            0 /*subScalar<short, float, signed char>*/,
+            subScalar<short, float, unsigned short>,
+            subScalar<short, float, short>,
+            subScalar<short, float, int>,
+            subScalar<short, float, float>,
+            subScalar<short, double, double>
+        },
+        {
+            0 /*subScalar<int, float, unsigned char>*/,
+            0 /*subScalar<int, float, signed char>*/,
+            0 /*subScalar<int, float, unsigned short>*/,
+            0 /*subScalar<int, float, short>*/,
+            subScalar<int, float, int>,
+            subScalar<int, float, float>,
+            subScalar<int, double, double>
+        },
+        {
+            0 /*subScalar<float, float, unsigned char>*/,
+            0 /*subScalar<float, float, signed char>*/,
+            0 /*subScalar<float, float, unsigned short>*/,
+            0 /*subScalar<float, float, short>*/,
+            0 /*subScalar<float, float, int>*/,
+            subScalar<float, float, float>,
+            subScalar<float, double, double>
+        },
+        {
+            0 /*subScalar<double, double, unsigned char>*/,
+            0 /*subScalar<double, double, signed char>*/,
+            0 /*subScalar<double, double, unsigned short>*/,
+            0 /*subScalar<double, double, short>*/,
+            0 /*subScalar<double, double, int>*/,
+            0 /*subScalar<double, double, float>*/,
+            subScalar<double, double, double>
+        }
     };
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
@@ -547,34 +909,34 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
     if (dtype < 0)
         dtype = src.depth();
 
-    CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-    CV_Assert(src.channels() <= 4);
-    CV_Assert(mask.empty() || (src.channels() == 1 && mask.size() == src.size() && mask.type() == CV_8U));
+    const int sdepth = src.depth();
+    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int cn = src.channels();
 
-    if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( cn <= 4 );
+    CV_Assert( mask.empty() || (cn == 1 && mask.size() == src.size() && mask.type() == CV_8U) );
+
+    if (sdepth == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    if (mask.empty() && dst.type() == src.type())
+    const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
+    if (ddepth == sdepth && cn > 1 && npp_func != 0)
     {
-        const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
-
-        if (npp_func)
-        {
-            npp_func(src, sc, dst, stream);
-            return;
-        }
+        npp_func(src, sc, dst, stream);
+        return;
     }
 
-    CV_Assert(src.channels() == 1);
+    CV_Assert( cn == 1 );
 
-    const func_t func = funcs[src.depth()][dst.depth()];
+    const func_t func = funcs[sdepth][ddepth];
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
@@ -585,120 +947,215 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
 ////////////////////////////////////////////////////////////////////////
 // multiply
 
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations only; the definitions are not part of this hunk
 {
-    void multiply_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream);
-    void multiply_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream);
+    void mulMat_8uc4_32f(PtrStepSz<unsigned int> src1, PtrStepSzf src2, PtrStepSz<unsigned int> dst, cudaStream_t stream); // called below when src1 is CV_8UC4 and src2 is CV_32FC1; elements are typed as unsigned int
 
-    template <typename T, typename D>
-    void multiply_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    void mulMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream); // called below when src1 is CV_16SC4 and src2 is CV_32FC1
 
-    template <typename T, typename D>
-    void multiply_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-}}}
+    template <typename T, typename S, typename D>
+    void mulMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); // general path; in the table below S is float, or double when D (or T) is double
+}
 
 void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
     {
-        CV_Assert(src1.size() == src2.size());
+        CV_Assert( src1.size() == src2.size() );
 
         dst.create(src1.size(), src1.type());
 
-        multiply_gpu(static_cast<PtrStepSz<uchar4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream);
+        mulMat_8uc4_32f(src1, src2, dst, stream); // GpuMat arguments rely on implicit conversion to the PtrStepSz parameter types; scale and dtype are not used on this branch
     }
     else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
     {
-        CV_Assert(src1.size() == src2.size());
+        CV_Assert( src1.size() == src2.size() );
 
         dst.create(src1.size(), src1.type());
 
-        multiply_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream);
+        mulMat_16sc4_32f(src1, src2, dst, stream); // same as above: implicit conversions, scale and dtype are not used on this branch
     }
     else
     {
-        typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
         static const func_t funcs[7][7] =
         {
-            {multiply_gpu<unsigned char, unsigned char>       , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short>    , multiply_gpu<unsigned char, short>       , multiply_gpu<unsigned char, int>    , multiply_gpu<unsigned char, float>    , multiply_gpu<unsigned char, double>    },
-            {0 /*multiply_gpu<signed char, unsigned char>*/   , 0 /*multiply_gpu<signed char, signed char>*/   , 0 /*multiply_gpu<signed char, unsigned short>*/, 0 /*multiply_gpu<signed char, short>*/   , 0 /*multiply_gpu<signed char, int>*/, 0 /*multiply_gpu<signed char, float>*/, 0 /*multiply_gpu<signed char, double>*/},
-            {0 /*multiply_gpu<unsigned short, unsigned char>*/, 0 /*multiply_gpu<unsigned short, signed char>*/, multiply_gpu<unsigned short, unsigned short>   , 0 /*multiply_gpu<unsigned short, short>*/, multiply_gpu<unsigned short, int>   , multiply_gpu<unsigned short, float>   , multiply_gpu<unsigned short, double>   },
-            {0 /*multiply_gpu<short, unsigned char>*/         , 0 /*multiply_gpu<short, signed char>*/         , 0 /*multiply_gpu<short, unsigned short>*/      , multiply_gpu<short, short>               , multiply_gpu<short, int>            , multiply_gpu<short, float>            , multiply_gpu<short, double>            },
-            {0 /*multiply_gpu<int, unsigned char>*/           , 0 /*multiply_gpu<int, signed char>*/           , 0 /*multiply_gpu<int, unsigned short>*/        , 0 /*multiply_gpu<int, short>*/           , multiply_gpu<int, int>              , multiply_gpu<int, float>              , multiply_gpu<int, double>              },
-            {0 /*multiply_gpu<float, unsigned char>*/         , 0 /*multiply_gpu<float, signed char>*/         , 0 /*multiply_gpu<float, unsigned short>*/      , 0 /*multiply_gpu<float, short>*/         , 0 /*multiply_gpu<float, int>*/      , multiply_gpu<float, float>            , multiply_gpu<float, double>            },
-            {0 /*multiply_gpu<double, unsigned char>*/        , 0 /*multiply_gpu<double, signed char>*/        , 0 /*multiply_gpu<double, unsigned short>*/     , 0 /*multiply_gpu<double, short>*/        , 0 /*multiply_gpu<double, int>*/     , 0 /*multiply_gpu<double, float>*/     , multiply_gpu<double, double>           }
-        };
-
-        typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
-        static const npp_func_t npp_funcs[] =
-        {
-            NppArithm<CV_8U , nppiMul_8u_C1RSfs >::call,
-            0,
-            NppArithm<CV_16U, nppiMul_16u_C1RSfs>::call,
-            NppArithm<CV_16S, nppiMul_16s_C1RSfs>::call,
-            NppArithm<CV_32S, nppiMul_32s_C1RSfs>::call,
-            NppArithm<CV_32F, nppiMul_32f_C1R   >::call
+            { // row = source depth, column = destination depth (looked up as funcs[sdepth][ddepth] below); a 0 entry is rejected as unsupported
+                mulMat<unsigned char, float, unsigned char>,
+                mulMat<unsigned char, float, signed char>,
+                mulMat<unsigned char, float, unsigned short>,
+                mulMat<unsigned char, float, short>,
+                mulMat<unsigned char, float, int>,
+                mulMat<unsigned char, float, float>,
+                mulMat<unsigned char, double, double>
+            },
+            {
+                mulMat<signed char, float, unsigned char>,
+                mulMat<signed char, float, signed char>,
+                mulMat<signed char, float, unsigned short>,
+                mulMat<signed char, float, short>,
+                mulMat<signed char, float, int>,
+                mulMat<signed char, float, float>,
+                mulMat<signed char, double, double>
+            },
+            {
+                0 /*mulMat<unsigned short, float, unsigned char>*/,
+                0 /*mulMat<unsigned short, float, signed char>*/,
+                mulMat<unsigned short, float, unsigned short>,
+                mulMat<unsigned short, float, short>,
+                mulMat<unsigned short, float, int>,
+                mulMat<unsigned short, float, float>,
+                mulMat<unsigned short, double, double>
+            },
+            {
+                0 /*mulMat<short, float, unsigned char>*/,
+                0 /*mulMat<short, float, signed char>*/,
+                mulMat<short, float, unsigned short>,
+                mulMat<short, float, short>,
+                mulMat<short, float, int>,
+                mulMat<short, float, float>,
+                mulMat<short, double, double>
+            },
+            {
+                0 /*mulMat<int, float, unsigned char>*/,
+                0 /*mulMat<int, float, signed char>*/,
+                0 /*mulMat<int, float, unsigned short>*/,
+                0 /*mulMat<int, float, short>*/,
+                mulMat<int, float, int>,
+                mulMat<int, float, float>,
+                mulMat<int, double, double>
+            },
+            {
+                0 /*mulMat<float, float, unsigned char>*/,
+                0 /*mulMat<float, float, signed char>*/,
+                0 /*mulMat<float, float, unsigned short>*/,
+                0 /*mulMat<float, float, short>*/,
+                0 /*mulMat<float, float, int>*/,
+                mulMat<float, float, float>,
+                mulMat<float, double, double>
+            },
+            {
+                0 /*mulMat<double, double, unsigned char>*/,
+                0 /*mulMat<double, double, signed char>*/,
+                0 /*mulMat<double, double, unsigned short>*/,
+                0 /*mulMat<double, double, short>*/,
+                0 /*mulMat<double, double, int>*/,
+                0 /*mulMat<double, double, float>*/,
+                mulMat<double, double, double>
+            }
         };
 
         if (dtype < 0)
             dtype = src1.depth();
 
-        CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-        CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
+        const int sdepth = src1.depth();
+        const int ddepth = CV_MAT_DEPTH(dtype); // only the depth part of dtype is used; the channel count always comes from src1
+        const int cn = src1.channels();
 
-        if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+        CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+        CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+
+        if (sdepth == CV_64F || ddepth == CV_64F)
         {
-            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            if (!deviceSupports(NATIVE_DOUBLE))
                 CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
         }
 
-        dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+        dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
 
-#if (CUDA_VERSION <= 4020)
-        if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F)
-#else
-        if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F && src1.depth() > CV_8U)
-#endif
-        {
-            npp_funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
-            return;
-        }
+        PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // channels are folded into the column count (cols * cn)
+        PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); // src1's dimensions are reused; the assert above guarantees src2 matches
+        PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
 
-        const func_t func = funcs[src1.depth()][dst.depth()];
+        const func_t func = funcs[sdepth][ddepth];
 
         if (!func)
             CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-        func(src1.reshape(1), src2.reshape(1), dst.reshape(1), scale, stream);
+        func(src1_, src2_, dst_, scale, stream); // scale is forwarded unchanged to the selected routine
     }
 }
 
-namespace
+namespace arithm // forward declaration only; the definition is not part of this hunk
 {
-    inline bool isIntScalar(Scalar sc)
-    {
-        return sc.val[0] == static_cast<int>(sc.val[0]) && sc.val[1] == static_cast<int>(sc.val[1]) && sc.val[2] == static_cast<int>(sc.val[2]) && sc.val[3] == static_cast<int>(sc.val[3]);
-    }
+    template <typename T, typename S, typename D>
+    void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // unlike the removed multiply_gpu scalar overload, there is no separate scale parameter
 }
 
 void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // no scale parameter: scale is multiplied into the scalar before dispatch
     static const func_t funcs[7][7] =
     {
-        {multiply_gpu<unsigned char, unsigned char>       , 0 /*multiply_gpu<unsigned char, signed char>*/ , multiply_gpu<unsigned char, unsigned short>    , multiply_gpu<unsigned char, short>       , multiply_gpu<unsigned char, int>    , multiply_gpu<unsigned char, float>    , multiply_gpu<unsigned char, double>    },
-        {0 /*multiply_gpu<signed char, unsigned char>*/   , 0 /*multiply_gpu<signed char, signed char>*/   , 0 /*multiply_gpu<signed char, unsigned short>*/, 0 /*multiply_gpu<signed char, short>*/   , 0 /*multiply_gpu<signed char, int>*/, 0 /*multiply_gpu<signed char, float>*/, 0 /*multiply_gpu<signed char, double>*/},
-        {0 /*multiply_gpu<unsigned short, unsigned char>*/, 0 /*multiply_gpu<unsigned short, signed char>*/, multiply_gpu<unsigned short, unsigned short>   , 0 /*multiply_gpu<unsigned short, short>*/, multiply_gpu<unsigned short, int>   , multiply_gpu<unsigned short, float>   , multiply_gpu<unsigned short, double>   },
-        {0 /*multiply_gpu<short, unsigned char>*/         , 0 /*multiply_gpu<short, signed char>*/         , 0 /*multiply_gpu<short, unsigned short>*/      , multiply_gpu<short, short>               , multiply_gpu<short, int>            , multiply_gpu<short, float>            , multiply_gpu<short, double>            },
-        {0 /*multiply_gpu<int, unsigned char>*/           , 0 /*multiply_gpu<int, signed char>*/           , 0 /*multiply_gpu<int, unsigned short>*/        , 0 /*multiply_gpu<int, short>*/           , multiply_gpu<int, int>              , multiply_gpu<int, float>              , multiply_gpu<int, double>              },
-        {0 /*multiply_gpu<float, unsigned char>*/         , 0 /*multiply_gpu<float, signed char>*/         , 0 /*multiply_gpu<float, unsigned short>*/      , 0 /*multiply_gpu<float, short>*/         , 0 /*multiply_gpu<float, int>*/      , multiply_gpu<float, float>            , multiply_gpu<float, double>            },
-        {0 /*multiply_gpu<double, unsigned char>*/        , 0 /*multiply_gpu<double, signed char>*/        , 0 /*multiply_gpu<double, unsigned short>*/     , 0 /*multiply_gpu<double, short>*/        , 0 /*multiply_gpu<double, int>*/     , 0 /*multiply_gpu<double, float>*/     , multiply_gpu<double, double>           }
+        { // row for T = unsigned char; entries within a row are ordered by destination depth
+            mulScalar<unsigned char, float, unsigned char>,
+            mulScalar<unsigned char, float, signed char>,
+            mulScalar<unsigned char, float, unsigned short>,
+            mulScalar<unsigned char, float, short>,
+            mulScalar<unsigned char, float, int>,
+            mulScalar<unsigned char, float, float>,
+            mulScalar<unsigned char, double, double> // the double destination column is the only one instantiated with S = double
+        },
+        {
+            mulScalar<signed char, float, unsigned char>,
+            mulScalar<signed char, float, signed char>,
+            mulScalar<signed char, float, unsigned short>,
+            mulScalar<signed char, float, short>,
+            mulScalar<signed char, float, int>,
+            mulScalar<signed char, float, float>,
+            mulScalar<signed char, double, double>
+        },
+        {
+            0 /*mulScalar<unsigned short, float, unsigned char>*/, // null entry: this depth pair is rejected below with CV_StsUnsupportedFormat
+            0 /*mulScalar<unsigned short, float, signed char>*/,
+            mulScalar<unsigned short, float, unsigned short>,
+            mulScalar<unsigned short, float, short>,
+            mulScalar<unsigned short, float, int>,
+            mulScalar<unsigned short, float, float>,
+            mulScalar<unsigned short, double, double>
+        },
+        {
+            0 /*mulScalar<short, float, unsigned char>*/,
+            0 /*mulScalar<short, float, signed char>*/,
+            mulScalar<short, float, unsigned short>,
+            mulScalar<short, float, short>,
+            mulScalar<short, float, int>,
+            mulScalar<short, float, float>,
+            mulScalar<short, double, double>
+        },
+        {
+            0 /*mulScalar<int, float, unsigned char>*/,
+            0 /*mulScalar<int, float, signed char>*/,
+            0 /*mulScalar<int, float, unsigned short>*/,
+            0 /*mulScalar<int, float, short>*/,
+            mulScalar<int, float, int>,
+            mulScalar<int, float, float>,
+            mulScalar<int, double, double>
+        },
+        {
+            0 /*mulScalar<float, float, unsigned char>*/,
+            0 /*mulScalar<float, float, signed char>*/,
+            0 /*mulScalar<float, float, unsigned short>*/,
+            0 /*mulScalar<float, float, short>*/,
+            0 /*mulScalar<float, float, int>*/,
+            mulScalar<float, float, float>,
+            mulScalar<float, double, double>
+        },
+        {
+            0 /*mulScalar<double, double, unsigned char>*/,
+            0 /*mulScalar<double, double, signed char>*/,
+            0 /*mulScalar<double, double, unsigned short>*/,
+            0 /*mulScalar<double, double, short>*/,
+            0 /*mulScalar<double, double, int>*/,
+            0 /*mulScalar<double, double, float>*/,
+            mulScalar<double, double, double>
+        }
     };
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
@@ -716,148 +1173,254 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
     if (dtype < 0)
         dtype = src.depth();
 
-    CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-    CV_Assert(src.channels() <= 4);
+    const int sdepth = src.depth(); // cached once; used for validation and for both dispatch tables
+    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int cn = src.channels();
 
-    if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( cn <= 4 );
+
+    if (sdepth == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE)) // NOTE(review): deviceSupports() is defined outside this hunk; presumably it covers both the build-time and run-time checks it replaces - confirm
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    if (dst.type() == src.type() && scale == 1 && (src.depth() == CV_32F || isIntScalar(sc)))
-    {
-        const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
+    const Scalar nsc(sc.val[0] * scale, sc.val[1] * scale, sc.val[2] * scale, sc.val[3] * scale); // fold scale into every scalar component so neither path below needs a separate scale argument
 
-        if (npp_func)
-        {
-            npp_func(src, sc, dst, stream);
-            return;
-        }
+    const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; // NOTE(review): the table is now indexed before any guard; npp_funcs is declared outside this hunk - confirm it has a row for every depth up to CV_64F
+    if (ddepth == sdepth && cn > 1 && npp_func != 0) // npp_func is used only for multi-channel input whose depth is unchanged
+    {
+        npp_func(src, nsc, dst, stream);
+        return;
     }
 
-    CV_Assert(src.channels() == 1);
+    CV_Assert( cn == 1 ); // everything that did not return above must be single-channel
 
-    const func_t func = funcs[src.depth()][dst.depth()];
+    const func_t func = funcs[sdepth][ddepth];
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src, sc.val[0], dst, scale, stream);
+    func(src, nsc.val[0], dst, stream); // only the first (already scaled) scalar component is passed on this single-channel path
 }
 
 ////////////////////////////////////////////////////////////////////////
 // divide
 
-namespace cv { namespace gpu { namespace device
+namespace arithm // forward declarations only; the definitions are not part of this hunk
 {
-    void divide_gpu(const PtrStepSz<uchar4>& src1, const PtrStepSzf& src2, const PtrStepSz<uchar4>& dst, cudaStream_t stream);
-    void divide_gpu(const PtrStepSz<short4>& src1, const PtrStepSzf& src2, const PtrStepSz<short4>& dst, cudaStream_t stream);
+    void divMat_8uc4_32f(PtrStepSz<unsigned int> src1, PtrStepSzf src2, PtrStepSz<unsigned int> dst, cudaStream_t stream); // called from the CV_8UC4 / CV_32FC1 branch of divide(); the 4-channel data is viewed through PtrStepSz<unsigned int>
 
-    template <typename T, typename D>
-    void divide_gpu(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    void divMat_16sc4_32f(PtrStepSz<short4> src1, PtrStepSzf src2, PtrStepSz<short4> dst, cudaStream_t stream); // called from the CV_16SC4 / CV_32FC1 branch of divide()
 
-    template <typename T, typename D>
-    void divide_gpu(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
-
-    template <typename T, typename D>
-    void divide_gpu(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
-}}}
+    template <typename T, typename S, typename D>
+    void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); // generic matrix / matrix entry point; unlike the scalar variants it still receives scale explicitly
+}
 
 void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
     if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
     {
-        CV_Assert(src1.size() == src2.size());
+        CV_Assert( src1.size() == src2.size() );
 
         dst.create(src1.size(), src1.type());
 
-        divide_gpu(static_cast<PtrStepSz<uchar4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<uchar4> >(dst), stream);
+        divMat_8uc4_32f(src1, src2, dst, stream); // the explicit static_casts were dropped, so this relies on implicit GpuMat -> PtrStepSz conversions; NOTE(review): scale and dtype are not used in this branch
     }
     else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
     {
-        CV_Assert(src1.size() == src2.size());
+        CV_Assert( src1.size() == src2.size() );
 
         dst.create(src1.size(), src1.type());
 
-        divide_gpu(static_cast<PtrStepSz<short4> >(src1), static_cast<PtrStepSzf>(src2), static_cast<PtrStepSz<short4> >(dst), stream);
+        divMat_16sc4_32f(src1, src2, dst, stream); // same pattern as the CV_8UC4 branch: implicit conversions, scale and dtype unused
     }
     else
     {
-        typedef void (*func_t)(const PtrStepSzb& src1, const PtrStepSzb& src2, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream); // arguments are now taken by value instead of by const reference
         static const func_t funcs[7][7] =
         {
-            {divide_gpu<unsigned char, unsigned char>       , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short>    , divide_gpu<unsigned char, short>       , divide_gpu<unsigned char, int>    , divide_gpu<unsigned char, float>    , divide_gpu<unsigned char, double>    },
-            {0 /*divide_gpu<signed char, unsigned char>*/   , 0 /*divide_gpu<signed char, signed char>*/   , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/   , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/},
-            {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short>   , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int>   , divide_gpu<unsigned short, float>   , divide_gpu<unsigned short, double>   },
-            {0 /*divide_gpu<short, unsigned char>*/         , 0 /*divide_gpu<short, signed char>*/         , 0 /*divide_gpu<short, unsigned short>*/      , divide_gpu<short, short>               , divide_gpu<short, int>            , divide_gpu<short, float>            , divide_gpu<short, double>            },
-            {0 /*divide_gpu<int, unsigned char>*/           , 0 /*divide_gpu<int, signed char>*/           , 0 /*divide_gpu<int, unsigned short>*/        , 0 /*divide_gpu<int, short>*/           , divide_gpu<int, int>              , divide_gpu<int, float>              , divide_gpu<int, double>              },
-            {0 /*divide_gpu<float, unsigned char>*/         , 0 /*divide_gpu<float, signed char>*/         , 0 /*divide_gpu<float, unsigned short>*/      , 0 /*divide_gpu<float, short>*/         , 0 /*divide_gpu<float, int>*/      , divide_gpu<float, float>            , divide_gpu<float, double>            },
-            {0 /*divide_gpu<double, unsigned char>*/        , 0 /*divide_gpu<double, signed char>*/        , 0 /*divide_gpu<double, unsigned short>*/     , 0 /*divide_gpu<double, short>*/        , 0 /*divide_gpu<double, int>*/     , 0 /*divide_gpu<double, float>*/     , divide_gpu<double, double>           }
-        };
-
-        typedef void (*npp_func_t)(const PtrStepSzb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
-        static const npp_func_t npp_funcs[6] =
-        {
-            NppArithm<CV_8U , nppiDiv_8u_C1RSfs >::call,
-            0,
-            NppArithm<CV_16U, nppiDiv_16u_C1RSfs>::call,
-            NppArithm<CV_16S, nppiDiv_16s_C1RSfs>::call,
-            NppArithm<CV_32S, nppiDiv_32s_C1RSfs>::call,
-            NppArithm<CV_32F, nppiDiv_32f_C1R   >::call
+            { // row for T = unsigned char; entries within a row are ordered by destination depth
+                divMat<unsigned char, float, unsigned char>,
+                divMat<unsigned char, float, signed char>,
+                divMat<unsigned char, float, unsigned short>,
+                divMat<unsigned char, float, short>,
+                divMat<unsigned char, float, int>,
+                divMat<unsigned char, float, float>,
+                divMat<unsigned char, double, double> // the double destination column is the only one instantiated with S = double
+            },
+            {
+                divMat<signed char, float, unsigned char>,
+                divMat<signed char, float, signed char>,
+                divMat<signed char, float, unsigned short>,
+                divMat<signed char, float, short>,
+                divMat<signed char, float, int>,
+                divMat<signed char, float, float>,
+                divMat<signed char, double, double>
+            },
+            {
+                0 /*divMat<unsigned short, float, unsigned char>*/, // null entry: this depth pair is rejected below with CV_StsUnsupportedFormat
+                0 /*divMat<unsigned short, float, signed char>*/,
+                divMat<unsigned short, float, unsigned short>,
+                divMat<unsigned short, float, short>,
+                divMat<unsigned short, float, int>,
+                divMat<unsigned short, float, float>,
+                divMat<unsigned short, double, double>
+            },
+            {
+                0 /*divMat<short, float, unsigned char>*/,
+                0 /*divMat<short, float, signed char>*/,
+                divMat<short, float, unsigned short>,
+                divMat<short, float, short>,
+                divMat<short, float, int>,
+                divMat<short, float, float>,
+                divMat<short, double, double>
+            },
+            {
+                0 /*divMat<int, float, unsigned char>*/,
+                0 /*divMat<int, float, signed char>*/,
+                0 /*divMat<int, float, unsigned short>*/,
+                0 /*divMat<int, float, short>*/,
+                divMat<int, float, int>,
+                divMat<int, float, float>,
+                divMat<int, double, double>
+            },
+            {
+                0 /*divMat<float, float, unsigned char>*/,
+                0 /*divMat<float, float, signed char>*/,
+                0 /*divMat<float, float, unsigned short>*/,
+                0 /*divMat<float, float, short>*/,
+                0 /*divMat<float, float, int>*/,
+                divMat<float, float, float>,
+                divMat<float, double, double>
+            },
+            {
+                0 /*divMat<double, double, unsigned char>*/,
+                0 /*divMat<double, double, signed char>*/,
+                0 /*divMat<double, double, unsigned short>*/,
+                0 /*divMat<double, double, short>*/,
+                0 /*divMat<double, double, int>*/,
+                0 /*divMat<double, double, float>*/,
+                divMat<double, double, double>
+            }
         };
 
         if (dtype < 0)
             dtype = src1.depth();
 
-        CV_Assert(src1.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-        CV_Assert(src1.type() == src2.type() && src1.size() == src2.size());
+        const int sdepth = src1.depth(); // cached once; used for validation and for the dispatch table
+        const int ddepth = CV_MAT_DEPTH(dtype);
+        const int cn = src1.channels();
 
-        if (src1.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+        CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+        CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() ); // both operands must agree in type and size; the views built below depend on this
+
+        if (sdepth == CV_64F || ddepth == CV_64F)
         {
-            if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            if (!deviceSupports(NATIVE_DOUBLE)) // NOTE(review): deviceSupports() is defined outside this hunk; presumably it covers both the build-time and run-time checks it replaces - confirm
                 CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
         }
 
-        dst.create(src1.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src1.channels()));
+        dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
 
-        if (scale == 1 && dst.type() == src1.type() && src1.depth() <= CV_32F)
-        {
-            npp_funcs[src1.depth()](src2.reshape(1), src1.reshape(1), dst.reshape(1), stream);
-            return;
-        }
+        PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // replaces reshape(1): same data and step, with cols * cn as the width
+        PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); // src1's dimensions are reused; the assert above guarantees src2 matches them
+        PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); // dst was just created with src1's size and cn channels
 
-        const func_t func = funcs[src1.depth()][dst.depth()];
+        const func_t func = funcs[sdepth][ddepth];
 
         if (!func)
             CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-        func(src1.reshape(1), src2.reshape(1), dst.reshape(1), scale, stream);
+        func(src1_, src2_, dst_, scale, stream); // the removed npp_funcs fast path is gone, so every remaining case goes through this table
     }
 }
 
+namespace arithm // forward declaration only; the definition of divScalar is not part of this hunk
+{
+    template <typename T, typename S, typename D>
+    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // instantiated per [source depth][destination depth] by the dispatch table in divide(); S is float or double there
+}
+
 void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb& src1, double val, const PtrStepSzb& dst, double scale, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // no scale parameter: scale is folded into the divisor before dispatch
     static const func_t funcs[7][7] =
     {
-        {divide_gpu<unsigned char, unsigned char>       , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short>    , divide_gpu<unsigned char, short>       , divide_gpu<unsigned char, int>    , divide_gpu<unsigned char, float>    , divide_gpu<unsigned char, double>    },
-        {0 /*divide_gpu<signed char, unsigned char>*/   , 0 /*divide_gpu<signed char, signed char>*/   , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/   , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/},
-        {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short>   , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int>   , divide_gpu<unsigned short, float>   , divide_gpu<unsigned short, double>   },
-        {0 /*divide_gpu<short, unsigned char>*/         , 0 /*divide_gpu<short, signed char>*/         , 0 /*divide_gpu<short, unsigned short>*/      , divide_gpu<short, short>               , divide_gpu<short, int>            , divide_gpu<short, float>            , divide_gpu<short, double>            },
-        {0 /*divide_gpu<int, unsigned char>*/           , 0 /*divide_gpu<int, signed char>*/           , 0 /*divide_gpu<int, unsigned short>*/        , 0 /*divide_gpu<int, short>*/           , divide_gpu<int, int>              , divide_gpu<int, float>              , divide_gpu<int, double>              },
-        {0 /*divide_gpu<float, unsigned char>*/         , 0 /*divide_gpu<float, signed char>*/         , 0 /*divide_gpu<float, unsigned short>*/      , 0 /*divide_gpu<float, short>*/         , 0 /*divide_gpu<float, int>*/      , divide_gpu<float, float>            , divide_gpu<float, double>            },
-        {0 /*divide_gpu<double, unsigned char>*/        , 0 /*divide_gpu<double, signed char>*/        , 0 /*divide_gpu<double, unsigned short>*/     , 0 /*divide_gpu<double, short>*/        , 0 /*divide_gpu<double, int>*/     , 0 /*divide_gpu<double, float>*/     , divide_gpu<double, double>           }
+        { // row for T = unsigned char; entries within a row are ordered by destination depth
+            divScalar<unsigned char, float, unsigned char>,
+            divScalar<unsigned char, float, signed char>,
+            divScalar<unsigned char, float, unsigned short>,
+            divScalar<unsigned char, float, short>,
+            divScalar<unsigned char, float, int>,
+            divScalar<unsigned char, float, float>,
+            divScalar<unsigned char, double, double> // the double destination column is the only one instantiated with S = double
+        },
+        {
+            divScalar<signed char, float, unsigned char>,
+            divScalar<signed char, float, signed char>,
+            divScalar<signed char, float, unsigned short>,
+            divScalar<signed char, float, short>,
+            divScalar<signed char, float, int>,
+            divScalar<signed char, float, float>,
+            divScalar<signed char, double, double>
+        },
+        {
+            0 /*divScalar<unsigned short, float, unsigned char>*/, // null entry: this depth pair is rejected below with CV_StsUnsupportedFormat
+            0 /*divScalar<unsigned short, float, signed char>*/,
+            divScalar<unsigned short, float, unsigned short>,
+            divScalar<unsigned short, float, short>,
+            divScalar<unsigned short, float, int>,
+            divScalar<unsigned short, float, float>,
+            divScalar<unsigned short, double, double>
+        },
+        {
+            0 /*divScalar<short, float, unsigned char>*/,
+            0 /*divScalar<short, float, signed char>*/,
+            divScalar<short, float, unsigned short>,
+            divScalar<short, float, short>,
+            divScalar<short, float, int>,
+            divScalar<short, float, float>,
+            divScalar<short, double, double>
+        },
+        {
+            0 /*divScalar<int, float, unsigned char>*/,
+            0 /*divScalar<int, float, signed char>*/,
+            0 /*divScalar<int, float, unsigned short>*/,
+            0 /*divScalar<int, float, short>*/,
+            divScalar<int, float, int>,
+            divScalar<int, float, float>,
+            divScalar<int, double, double>
+        },
+        {
+            0 /*divScalar<float, float, unsigned char>*/,
+            0 /*divScalar<float, float, signed char>*/,
+            0 /*divScalar<float, float, unsigned short>*/,
+            0 /*divScalar<float, float, short>*/,
+            0 /*divScalar<float, float, int>*/,
+            divScalar<float, float, float>,
+            divScalar<float, double, double>
+        },
+        {
+            0 /*divScalar<double, double, unsigned char>*/,
+            0 /*divScalar<double, double, signed char>*/,
+            0 /*divScalar<double, double, unsigned short>*/,
+            0 /*divScalar<double, double, short>*/,
+            0 /*divScalar<double, double, int>*/,
+            0 /*divScalar<double, double, float>*/,
+            divScalar<double, double, double>
+        }
     };
 
     typedef void (*npp_func_t)(const PtrStepSzb src, Scalar sc, PtrStepb dst, cudaStream_t stream);
@@ -875,536 +1438,547 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
     if (dtype < 0)
         dtype = src.depth();
 
-    CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-    CV_Assert(src.channels() <= 4);
+    const int sdepth = src.depth(); // cached once; used for validation and for both dispatch tables
+    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int cn = src.channels();
 
-    if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( cn <= 4 );
+
+    if (sdepth == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE)) // NOTE(review): deviceSupports() is defined outside this hunk; presumably it covers both the build-time and run-time checks it replaces - confirm
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    if (dst.type() == src.type() && scale == 1 && (src.depth() == CV_32F || isIntScalar(sc)))
-    {
-        const npp_func_t npp_func = npp_funcs[src.depth()][src.channels() - 1];
+    const Scalar nsc(sc.val[0] / scale, sc.val[1] / scale, sc.val[2] / scale, sc.val[3] / scale); // scale is folded into the divisor (sc / scale); NOTE(review): scale == 0 is not checked before this division
 
-        if (npp_func)
-        {
-            npp_func(src, sc, dst, stream);
-            return;
-        }
+    const npp_func_t npp_func = npp_funcs[sdepth][cn - 1]; // NOTE(review): the table is now indexed before any guard; npp_funcs is declared outside this hunk - confirm it has a row for every depth up to CV_64F
+    if (ddepth == sdepth && cn > 1 && npp_func != 0) // npp_func is used only for multi-channel input whose depth is unchanged
+    {
+        npp_func(src, nsc, dst, stream);
+        return;
     }
 
-    CV_Assert(src.channels() == 1);
+    CV_Assert( cn == 1 ); // everything that did not return above must be single-channel
 
-    const func_t func = funcs[src.depth()][dst.depth()];
+    const func_t func = funcs[sdepth][ddepth];
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src, sc.val[0], dst, scale, stream);
+    func(src, nsc.val[0], dst, stream); // only the first (already rescaled) scalar component is passed on this single-channel path
+}
+
+namespace arithm // forward declaration only; the definition of divInv is not part of this hunk
+{
+    template <typename T, typename S, typename D>
+    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // same parameter list as mulScalar/divScalar; used by divide(double scale, src, ...)
 }
 
 void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(double scalar, const PtrStepSzb& src2, const PtrStepSzb& dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream); // argument order is now (src, value, dst, stream), the same shape as the other scalar dispatch typedefs in this file
     static const func_t funcs[7][7] =
     {
-        {divide_gpu<unsigned char, unsigned char>       , 0 /*divide_gpu<unsigned char, signed char>*/ , divide_gpu<unsigned char, unsigned short>    , divide_gpu<unsigned char, short>       , divide_gpu<unsigned char, int>    , divide_gpu<unsigned char, float>    , divide_gpu<unsigned char, double>    },
-        {0 /*divide_gpu<signed char, unsigned char>*/   , 0 /*divide_gpu<signed char, signed char>*/   , 0 /*divide_gpu<signed char, unsigned short>*/, 0 /*divide_gpu<signed char, short>*/   , 0 /*divide_gpu<signed char, int>*/, 0 /*divide_gpu<signed char, float>*/, 0 /*divide_gpu<signed char, double>*/},
-        {0 /*divide_gpu<unsigned short, unsigned char>*/, 0 /*divide_gpu<unsigned short, signed char>*/, divide_gpu<unsigned short, unsigned short>   , 0 /*divide_gpu<unsigned short, short>*/, divide_gpu<unsigned short, int>   , divide_gpu<unsigned short, float>   , divide_gpu<unsigned short, double>   },
-        {0 /*divide_gpu<short, unsigned char>*/         , 0 /*divide_gpu<short, signed char>*/         , 0 /*divide_gpu<short, unsigned short>*/      , divide_gpu<short, short>               , divide_gpu<short, int>            , divide_gpu<short, float>            , divide_gpu<short, double>            },
-        {0 /*divide_gpu<int, unsigned char>*/           , 0 /*divide_gpu<int, signed char>*/           , 0 /*divide_gpu<int, unsigned short>*/        , 0 /*divide_gpu<int, short>*/           , divide_gpu<int, int>              , divide_gpu<int, float>              , divide_gpu<int, double>              },
-        {0 /*divide_gpu<float, unsigned char>*/         , 0 /*divide_gpu<float, signed char>*/         , 0 /*divide_gpu<float, unsigned short>*/      , 0 /*divide_gpu<float, short>*/         , 0 /*divide_gpu<float, int>*/      , divide_gpu<float, float>            , divide_gpu<float, double>            },
-        {0 /*divide_gpu<double, unsigned char>*/        , 0 /*divide_gpu<double, signed char>*/        , 0 /*divide_gpu<double, unsigned short>*/     , 0 /*divide_gpu<double, short>*/        , 0 /*divide_gpu<double, int>*/     , 0 /*divide_gpu<double, float>*/     , divide_gpu<double, double>           }
+        { // row for T = unsigned char; entries within a row are ordered by destination depth
+            divInv<unsigned char, float, unsigned char>,
+            divInv<unsigned char, float, signed char>,
+            divInv<unsigned char, float, unsigned short>,
+            divInv<unsigned char, float, short>,
+            divInv<unsigned char, float, int>,
+            divInv<unsigned char, float, float>,
+            divInv<unsigned char, double, double> // the double destination column is the only one instantiated with S = double
+        },
+        {
+            divInv<signed char, float, unsigned char>,
+            divInv<signed char, float, signed char>,
+            divInv<signed char, float, unsigned short>,
+            divInv<signed char, float, short>,
+            divInv<signed char, float, int>,
+            divInv<signed char, float, float>,
+            divInv<signed char, double, double>
+        },
+        {
+            0 /*divInv<unsigned short, float, unsigned char>*/, // null entry: this depth pair is rejected below with CV_StsUnsupportedFormat
+            0 /*divInv<unsigned short, float, signed char>*/,
+            divInv<unsigned short, float, unsigned short>,
+            divInv<unsigned short, float, short>,
+            divInv<unsigned short, float, int>,
+            divInv<unsigned short, float, float>,
+            divInv<unsigned short, double, double>
+        },
+        {
+            0 /*divInv<short, float, unsigned char>*/,
+            0 /*divInv<short, float, signed char>*/,
+            divInv<short, float, unsigned short>,
+            divInv<short, float, short>,
+            divInv<short, float, int>,
+            divInv<short, float, float>,
+            divInv<short, double, double>
+        },
+        {
+            0 /*divInv<int, float, unsigned char>*/,
+            0 /*divInv<int, float, signed char>*/,
+            0 /*divInv<int, float, unsigned short>*/,
+            0 /*divInv<int, float, short>*/,
+            divInv<int, float, int>,
+            divInv<int, float, float>,
+            divInv<int, double, double>
+        },
+        {
+            0 /*divInv<float, float, unsigned char>*/,
+            0 /*divInv<float, float, signed char>*/,
+            0 /*divInv<float, float, unsigned short>*/,
+            0 /*divInv<float, float, short>*/,
+            0 /*divInv<float, float, int>*/,
+            divInv<float, float, float>,
+            divInv<float, double, double>
+        },
+        {
+            0 /*divInv<double, double, unsigned char>*/,
+            0 /*divInv<double, double, signed char>*/,
+            0 /*divInv<double, double, unsigned short>*/,
+            0 /*divInv<double, double, short>*/,
+            0 /*divInv<double, double, int>*/,
+            0 /*divInv<double, double, float>*/,
+            divInv<double, double, double>
+        }
     };
 
     if (dtype < 0)
         dtype = src.depth();
 
-    CV_Assert(src.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-    CV_Assert(src.channels() == 1);
+    const int sdepth = src.depth(); // cached once; used for validation and for the dispatch table
+    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int cn = src.channels();
 
-    if (src.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
+    CV_Assert( cn == 1 ); // this overload accepts single-channel input only
+
+    if (sdepth == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE)) // NOTE(review): deviceSupports() is defined outside this hunk; presumably it covers both the build-time and run-time checks it replaces - confirm
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), src.channels()));
+    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
 
     cudaStream_t stream = StreamAccessor::getStream(s);
 
-    const func_t func = funcs[src.depth()][dst.depth()];
+    const func_t func = funcs[sdepth][ddepth];
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(scale, src, dst, stream);
+    func(src, scale, dst, stream); // scale is passed as divInv's val argument; presumably it is the numerator of scale / src - confirm against the kernel definition
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // absdiff
 
-namespace cv { namespace gpu { namespace device
+namespace arithm
 {
     template <typename T>
-    void absdiff_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    void vabsDiff4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
     template <typename T>
-    void absdiff_gpu(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}}}
+    void vabsDiff2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 
-namespace
-{
-    template <int DEPTH> struct NppAbsDiffFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* src1, int src1_step, const npp_t* src2, int src2_step, npp_t* dst, int dst_step, NppiSize sz);
-    };
-
-    template <int DEPTH, typename NppAbsDiffFunc<DEPTH>::func_t func> struct NppAbsDiff
-    {
-        typedef typename NppAbsDiffFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize sz;
-            sz.width  = src1.cols;
-            sz.height = src1.rows;
-
-            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step), (const npp_t*)src2.data, static_cast<int>(src2.step),
-                              (npp_t*)dst.data, static_cast<int>(dst.step), sz) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
-    template <int DEPTH> struct NppAbsDiffCFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-        typedef npp_t scalar_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc1, int nSrc1Step, npp_t* pDst,  int nDstStep,  NppiSize oSizeROI, npp_t nConstant);
-    };
-    template <> struct NppAbsDiffCFunc<CV_16U>
-    {
-        typedef NppTypeTraits<CV_16U>::npp_t npp_t;
-        typedef Npp32u scalar_t;
-
-#if (CUDA_VERSION <= 4020)
-        typedef NppStatus (*func_t)(const Npp16u* pSrc1, int nSrc1Step, Npp16u* pDst, int nDstStep, NppiSize oSizeROI, Npp32u nConstant);
-#else
-        typedef NppStatus (*func_t)(const Npp16u * pSrc1, int nSrc1Step, Npp16u * pDst,  int nDstStep,  NppiSize oSizeROI, Npp16u nConstant);
-#endif
-    };
-
-    template <int DEPTH, typename NppAbsDiffCFunc<DEPTH>::func_t func> struct NppAbsDiffC
-    {
-        typedef typename NppAbsDiffCFunc<DEPTH>::npp_t npp_t;
-        typedef typename NppAbsDiffCFunc<DEPTH>::scalar_t scalar_t;
-
-        static void call(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize sz;
-            sz.width  = src1.cols;
-            sz.height = src1.rows;
-
-            nppSafeCall( func((const npp_t*)src1.data, static_cast<int>(src1.step),
-                              (npp_t*)dst.data, static_cast<int>(dst.step), sz, static_cast<scalar_t>(val)) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
+    template <typename T>
+    void absDiffMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        NppAbsDiff<CV_8U, nppiAbsDiff_8u_C1R>::call,
-        absdiff_gpu<signed char>,
-        NppAbsDiff<CV_16U, nppiAbsDiff_16u_C1R>::call,
-        absdiff_gpu<short>,
-        absdiff_gpu<int>,
-        NppAbsDiff<CV_32F, nppiAbsDiff_32f_C1R>::call,
-        absdiff_gpu<double>
+        absDiffMat<unsigned char>,
+        absDiffMat<signed char>,
+        absDiffMat<unsigned short>,
+        absDiffMat<short>,
+        absDiffMat<int>,
+        absDiffMat<float>,
+        absDiffMat<double>
+    };
+    static const func_t vfuncs4[] =
+    {
+        vabsDiff4<unsigned int>,
+        vabsDiff4<int>,
+        0,
+        0
+    };
+    static const func_t vfuncs2[] =
+    {
+        0,
+        0,
+        vabsDiff2<unsigned int>,
+        vabsDiff2<int>
     };
 
-    CV_Assert(src1.depth() <= CV_64F);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+    const int depth = src1.depth();
+    const int cn = src1.channels();
 
-    if (src1.depth() == CV_64F)
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+
+    if (depth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
     dst.create(src1.size(), src1.type());
 
-    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+    if (depth < CV_32S)
+    {
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0;
+
+        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        {
+            const func_t vfunc4 = vfuncs4[depth];
+            const func_t vfunc2 = vfuncs2[depth];
+
+            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            {
+                const int vcols = src1_.cols >> 2;
+
+                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+
+            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            {
+                const int vcols = src1_.cols >> 1;
+
+                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+        }
+    }
+
+    const func_t func = funcs[depth];
+
+    if (!func)
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, stream);
+}
+
+namespace arithm
+{
+    template <typename T, typename S>
+    void absDiffScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 }
 
 void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& stream)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        NppAbsDiffC<CV_8U, nppiAbsDiffC_8u_C1R>::call,
-        absdiff_gpu<signed char>,
-        NppAbsDiffC<CV_16U, nppiAbsDiffC_16u_C1R>::call,
-        absdiff_gpu<short>,
-        absdiff_gpu<int>,
-        NppAbsDiffC<CV_32F, nppiAbsDiffC_32f_C1R>::call,
-        absdiff_gpu<double>
+        absDiffScalar<unsigned char, float>,
+        absDiffScalar<signed char, float>,
+        absDiffScalar<unsigned short, float>,
+        absDiffScalar<short, float>,
+        absDiffScalar<int, float>,
+        absDiffScalar<float, float>,
+        absDiffScalar<double, double>
     };
 
-    CV_Assert(src1.depth() <= CV_64F);
-    CV_Assert(src1.channels() == 1);
+    const int depth = src1.depth();
 
-    if (src1.depth() == CV_64F)
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src1.channels() == 1 );
+
+    if (depth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
     dst.create(src1.size(), src1.type());
 
-    funcs[src1.depth()](src1, src2.val[0], dst, StreamAccessor::getStream(stream));
+    funcs[depth](src1, src2.val[0], dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // abs
 
-void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& s)
+namespace arithm
 {
-    CV_Assert(src.depth() == CV_16S || src.depth() == CV_32F);
+    template <typename T>
+    void absMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
+void cv::gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream)
+{
+    using namespace arithm;
+
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        absMat<unsigned char>,
+        absMat<signed char>,
+        absMat<unsigned short>,
+        absMat<short>,
+        absMat<int>,
+        absMat<float>,
+        absMat<double>
+    };
+
+    const int depth = src.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src.channels() == 1 );
+
+    if (depth == CV_64F)
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
 
     dst.create(src.size(), src.type());
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    NppStreamHandler h(stream);
-
-    NppiSize oSizeROI;
-    oSizeROI.width = src.cols * src.channels();
-    oSizeROI.height = src.rows;
-
-    bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-
-    if (src.depth() == CV_16S)
-    {
-        if (aligned && oSizeROI.width % 4 == 0)
-        {
-            oSizeROI.width /= 4;
-            nppSafeCall( nppiAbs_16s_C4R(src.ptr<Npp16s>(), static_cast<int>(src.step), dst.ptr<Npp16s>(), static_cast<int>(dst.step), oSizeROI) );
-        }
-        else
-        {
-            nppSafeCall( nppiAbs_16s_C1R(src.ptr<Npp16s>(), static_cast<int>(src.step), dst.ptr<Npp16s>(), static_cast<int>(dst.step), oSizeROI) );
-        }
-    }
-    else
-    {
-        if (aligned && oSizeROI.width % 4 == 0)
-        {
-            oSizeROI.width /= 4;
-            nppSafeCall( nppiAbs_32f_C4R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), oSizeROI) );
-        }
-        else
-        {
-            nppSafeCall( nppiAbs_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), oSizeROI) );
-        }
-    }
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
+    funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // sqr
 
-namespace
+namespace arithm
 {
-    template <int DEPTH> struct NppSqrFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
-    };
-    template <> struct NppSqrFunc<CV_32F>
-    {
-        typedef NppTypeTraits<CV_32F>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
-    };
-
-    template <int DEPTH, typename NppSqrFunc<DEPTH>::func_t func, typename NppSqrFunc<DEPTH>::func_t func_c4> struct NppSqr
-    {
-        typedef typename NppSqrFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize oSizeROI;
-            oSizeROI.width = src.cols * src.channels();
-            oSizeROI.height = src.rows;
-
-            bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-
-            if (aligned && oSizeROI.width % 4 == 0)
-            {
-                oSizeROI.width /= 4;
-                nppSafeCall( func_c4(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) );
-            }
-            else
-            {
-                nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) );
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template <typename NppSqrFunc<CV_32F>::func_t func, typename NppSqrFunc<CV_32F>::func_t func_c4> struct NppSqr<CV_32F, func, func_c4>
-    {
-        typedef NppSqrFunc<CV_32F>::npp_t npp_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize oSizeROI;
-            oSizeROI.width = src.cols * src.channels();
-            oSizeROI.height = src.rows;
-
-            bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-
-            if (aligned && oSizeROI.width % 4 == 0)
-            {
-                oSizeROI.width /= 4;
-                nppSafeCall( func_c4(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
-            }
-            else
-            {
-                nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
+    template <typename T>
+    void sqrMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 }
 
 void cv::gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+    using namespace arithm;
 
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        NppSqr<CV_8U, nppiSqr_8u_C1RSfs, nppiSqr_8u_C4RSfs>::call,
-        0,
-        NppSqr<CV_16U, nppiSqr_16u_C1RSfs, nppiSqr_16u_C4RSfs>::call,
-        NppSqr<CV_16S, nppiSqr_16s_C1RSfs, nppiSqr_16s_C4RSfs>::call,
-        0,
-        NppSqr<CV_32F, nppiSqr_32f_C1R, nppiSqr_32f_C4R>::call
+        sqrMat<unsigned char>,
+        sqrMat<signed char>,
+        sqrMat<unsigned short>,
+        sqrMat<short>,
+        sqrMat<int>,
+        sqrMat<float>,
+        sqrMat<double>
     };
 
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+    const int depth = src.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src.channels() == 1 );
+
+    if (depth == CV_64F)
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // sqrt
 
-namespace
+namespace arithm
 {
-    template <int DEPTH> struct NppOneSourceFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI, int nScaleFactor);
-    };
-    template <> struct NppOneSourceFunc<CV_32F>
-    {
-        typedef NppTypeTraits<CV_32F>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oSizeROI);
-    };
-
-    template <int DEPTH, typename NppOneSourceFunc<DEPTH>::func_t func> struct NppOneSource
-    {
-        typedef typename NppOneSourceFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize oSizeROI;
-            oSizeROI.width = src.cols * src.channels();
-            oSizeROI.height = src.rows;
-
-            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI, 0) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template <typename NppOneSourceFunc<CV_32F>::func_t func> struct NppOneSource<CV_32F, func>
-    {
-        typedef NppOneSourceFunc<CV_32F>::npp_t npp_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize oSizeROI;
-            oSizeROI.width = src.cols * src.channels();
-            oSizeROI.height = src.rows;
-
-            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step), dst.ptr<npp_t>(), static_cast<int>(dst.step), oSizeROI) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
+    template <typename T>
+    void sqrtMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
 }
 
 void cv::gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+    using namespace arithm;
 
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        NppOneSource<CV_8U, nppiSqrt_8u_C1RSfs>::call,
-        0,
-        NppOneSource<CV_16U, nppiSqrt_16u_C1RSfs>::call,
-        NppOneSource<CV_16S, nppiSqrt_16s_C1RSfs>::call,
-        0,
-        NppOneSource<CV_32F, nppiSqrt_32f_C1R>::call
+        sqrtMat<unsigned char>,
+        sqrtMat<signed char>,
+        sqrtMat<unsigned short>,
+        sqrtMat<short>,
+        sqrtMat<int>,
+        sqrtMat<float>,
+        sqrtMat<double>
     };
 
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+    const int depth = src.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src.channels() == 1 );
+
+    if (depth == CV_64F)
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
 // log
 
+namespace arithm
+{
+    template <typename T>
+    void logMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
 void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+    using namespace arithm;
 
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        NppOneSource<CV_8U, nppiLn_8u_C1RSfs>::call,
-        0,
-        NppOneSource<CV_16U, nppiLn_16u_C1RSfs>::call,
-        NppOneSource<CV_16S, nppiLn_16s_C1RSfs>::call,
-        0,
-        NppOneSource<CV_32F, nppiLn_32f_C1R>::call
+        logMat<unsigned char>,
+        logMat<signed char>,
+        logMat<unsigned short>,
+        logMat<short>,
+        logMat<int>,
+        logMat<float>,
+        logMat<double>
     };
 
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+    const int depth = src.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src.channels() == 1 );
+
+    if (depth == CV_64F)
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
 // exp
 
+namespace arithm
+{
+    template <typename T>
+    void expMat(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
+}
+
 void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
+    using namespace arithm;
 
+    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        NppOneSource<CV_8U, nppiExp_8u_C1RSfs>::call,
-        0,
-        NppOneSource<CV_16U, nppiExp_16u_C1RSfs>::call,
-        NppOneSource<CV_16S, nppiExp_16s_C1RSfs>::call,
-        0,
-        NppOneSource<CV_32F, nppiExp_32f_C1R>::call
+        expMat<unsigned char>,
+        expMat<signed char>,
+        expMat<unsigned short>,
+        expMat<short>,
+        expMat<int>,
+        expMat<float>,
+        expMat<double>
     };
 
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_16S || src.depth() == CV_32F);
+    const int depth = src.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src.channels() == 1 );
+
+    if (depth == CV_64F)
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()](src, dst, StreamAccessor::getStream(stream));
+    funcs[depth](src, dst, StreamAccessor::getStream(stream));
 }
 
 //////////////////////////////////////////////////////////////////////////////
-// Comparison of two matrixes
+// compare
 
-namespace cv { namespace gpu { namespace device
+namespace arithm
 {
-    template <typename T> void compare_eq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_ne(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_lt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_le(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpMatEq(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpMatNe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpMatLt(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpMatLe(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+}
 
-    template <typename T> void compare_eq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_ne(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_lt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_le(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_gt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void compare_ge(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
-}}}
-
-void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
+void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& s)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][4] =
     {
-        {compare_eq<unsigned char> , compare_ne<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> },
-        {compare_eq<signed char>   , compare_ne<signed char>   , compare_lt<signed char>   , compare_le<signed char>   },
-        {compare_eq<unsigned short>, compare_ne<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>},
-        {compare_eq<short>         , compare_ne<short>         , compare_lt<short>         , compare_le<short>         },
-        {compare_eq<int>           , compare_ne<int>           , compare_lt<int>           , compare_le<int>           },
-        {compare_eq<float>         , compare_ne<float>         , compare_lt<float>         , compare_le<float>         },
-        {compare_eq<double>        , compare_ne<double>        , compare_lt<double>        , compare_le<double>        }
+        {cmpMatEq<unsigned char> , cmpMatNe<unsigned char> , cmpMatLt<unsigned char> , cmpMatLe<unsigned char> },
+        {cmpMatEq<signed char>   , cmpMatNe<signed char>   , cmpMatLt<signed char>   , cmpMatLe<signed char>   },
+        {cmpMatEq<unsigned short>, cmpMatNe<unsigned short>, cmpMatLt<unsigned short>, cmpMatLe<unsigned short>},
+        {cmpMatEq<short>         , cmpMatNe<short>         , cmpMatLt<short>         , cmpMatLe<short>         },
+        {cmpMatEq<int>           , cmpMatNe<int>           , cmpMatLt<int>           , cmpMatLe<int>           },
+        {cmpMatEq<float>         , cmpMatNe<float>         , cmpMatLt<float>         , cmpMatLe<float>         },
+        {cmpMatEq<double>        , cmpMatNe<double>        , cmpMatLt<double>        , cmpMatLe<double>        }
     };
 
-    CV_Assert(src1.depth() <= CV_64F);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-    CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
+    const int depth = src1.depth();
+    const int cn = src1.channels();
 
-    if (src1.depth() == CV_64F)
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+    CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
+
+    if (depth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
+    dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, cn));
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
     static const int codes[] =
     {
         0, 2, 3, 2, 3, 1
@@ -1418,15 +1992,29 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         &src2, &src1, &src1, &src2, &src2, &src2
     };
 
-    dst.create(src1.size(), CV_MAKE_TYPE(CV_8U, src1.channels()));
+    const int code = codes[cmpop];
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, psrc1[cmpop]->data, psrc1[cmpop]->step);
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, psrc2[cmpop]->data, psrc2[cmpop]->step);
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
 
-    funcs[src1.depth()][codes[cmpop]](psrc1[cmpop]->reshape(1), psrc2[cmpop]->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+    const func_t func = funcs[depth][code];
+
+    func(src1_, src2_, dst_, stream);
+}
+
+namespace arithm
+{
+    template <typename T> void cmpScalarEq(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarNe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarLt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarLe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarGt(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void cmpScalarGe(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
 }
 
 namespace
 {
-    template <typename T>
-    void castScalar(Scalar& sc)
+    template <typename T> void castScalar(Scalar& sc)
     {
         sc.val[0] = saturate_cast<T>(sc.val[0]);
         sc.val[1] = saturate_cast<T>(sc.val[1]);
@@ -1437,18 +2025,18 @@ namespace
 
 void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stream& stream)
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
     typedef void (*func_t)(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][6] =
     {
-        {compare_eq<unsigned char> , compare_gt<unsigned char> , compare_ge<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> , compare_ne<unsigned char> },
-        {compare_eq<signed char>   , compare_gt<signed char>   , compare_ge<signed char>   , compare_lt<signed char>   , compare_le<signed char>   , compare_ne<signed char>   },
-        {compare_eq<unsigned short>, compare_gt<unsigned short>, compare_ge<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>, compare_ne<unsigned short>},
-        {compare_eq<short>         , compare_gt<short>         , compare_ge<short>         , compare_lt<short>         , compare_le<short>         , compare_ne<short>         },
-        {compare_eq<int>           , compare_gt<int>           , compare_ge<int>           , compare_lt<int>           , compare_le<int>           , compare_ne<int>           },
-        {compare_eq<float>         , compare_gt<float>         , compare_ge<float>         , compare_lt<float>         , compare_le<float>         , compare_ne<float>         },
-        {compare_eq<double>        , compare_gt<double>        , compare_ge<double>        , compare_lt<double>        , compare_le<double>        , compare_ne<double>        }
+        {cmpScalarEq<unsigned char> , cmpScalarGt<unsigned char> , cmpScalarGe<unsigned char> , cmpScalarLt<unsigned char> , cmpScalarLe<unsigned char> , cmpScalarNe<unsigned char> },
+        {cmpScalarEq<signed char>   , cmpScalarGt<signed char>   , cmpScalarGe<signed char>   , cmpScalarLt<signed char>   , cmpScalarLe<signed char>   , cmpScalarNe<signed char>   },
+        {cmpScalarEq<unsigned short>, cmpScalarGt<unsigned short>, cmpScalarGe<unsigned short>, cmpScalarLt<unsigned short>, cmpScalarLe<unsigned short>, cmpScalarNe<unsigned short>},
+        {cmpScalarEq<short>         , cmpScalarGt<short>         , cmpScalarGe<short>         , cmpScalarLt<short>         , cmpScalarLe<short>         , cmpScalarNe<short>         },
+        {cmpScalarEq<int>           , cmpScalarGt<int>           , cmpScalarGe<int>           , cmpScalarLt<int>           , cmpScalarLe<int>           , cmpScalarNe<int>           },
+        {cmpScalarEq<float>         , cmpScalarGt<float>         , cmpScalarGe<float>         , cmpScalarLt<float>         , cmpScalarLe<float>         , cmpScalarNe<float>         },
+        {cmpScalarEq<double>        , cmpScalarGt<double>        , cmpScalarGe<double>        , cmpScalarLt<double>        , cmpScalarLe<double>        , cmpScalarNe<double>        }
     };
 
     typedef void (*cast_func_t)(Scalar& sc);
@@ -1457,235 +2045,266 @@ void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stre
         castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
     };
 
-    CV_Assert(src.depth() <= CV_64F);
-    CV_Assert(src.channels() <= 4);
-    CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
+    const int depth = src.depth();
+    const int cn = src.channels();
 
-    if (src.depth() == CV_64F)
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( cn <= 4 );
+    CV_Assert( cmpop >= CMP_EQ && cmpop <= CMP_NE );
+
+    if (depth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(CV_8U, src.channels()));
+    dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn));
 
-    cast_func[src.depth()](sc);
+    cast_func[depth](sc);
 
-    funcs[src.depth()][cmpop](src, src.channels(), sc.val, dst, StreamAccessor::getStream(stream));
+    funcs[depth][cmpop](src, cn, sc.val, dst, StreamAccessor::getStream(stream));
 }
 
-
 //////////////////////////////////////////////////////////////////////////////
 // Unary bitwise logical operations
 
-namespace cv { namespace gpu { namespace device
+namespace arithm
 {
-    void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);
-
-    template <typename T>
-    void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
-}}}
-
-namespace
-{
-    void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-    {
-        dst.create(src.size(), src.type());
-
-        cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
-    }
-
-    void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
-    {
-        using namespace cv::gpu::device;
-
-        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static func_t funcs[] =
-        {
-            bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>,
-            bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>,
-            bitwiseMaskNotCaller<unsigned int>, bitwiseMaskNotCaller<unsigned int>,
-            bitwiseMaskNotCaller<unsigned int>
-        };
-
-        CV_Assert(src.depth() <= CV_64F);
-        CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
-
-        dst.create(src.size(), src.type());
-
-        const func_t func = funcs[src.depth()];
-
-        int cn = src.depth() != CV_64F ? src.channels() : src.channels() * (sizeof(double) / sizeof(unsigned int));
-
-        func(src.rows, src.cols, cn, src, mask, dst, stream);
-    }
+    template <typename T> void bitMatNot(PtrStepSzb src, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
+void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& s) // dst = ~src, optionally masked; dst is (re)allocated to src's size/type
 {
-    if (mask.empty())
-        bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));
+    using namespace arithm;
+
+    const int depth = src.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) ); // NOTE(review): mask is per-pixel while rows below are passed as packed words - confirm the kernel's mask indexing
+
+    dst.create(src.size(), src.type());
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    const int bcols = src.cols * src.elemSize(); // row width in bytes; the operation is type-agnostic
+
+    if ((bcols & 3) == 0) // whole number of 32-bit words per row; NOTE(review): pointer/step alignment is not checked here
+    {
+        const int vcols = bcols >> 2;
+
+        bitMatNot<unsigned int>(
+                    PtrStepSzb(src.rows, vcols, src.data, src.step),
+                    PtrStepSzb(src.rows, vcols, dst.data, dst.step),
+                    mask, stream);
+    }
+    else if ((bcols & 1) == 0) // whole number of 16-bit words per row
+    {
+        const int vcols = bcols >> 1;
+
+        bitMatNot<unsigned short>(
+                    PtrStepSzb(src.rows, vcols, src.data, src.step),
+                    PtrStepSzb(src.rows, vcols, dst.data, dst.step),
+                    mask, stream);
+    }
     else
-        bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
+    {
+        bitMatNot<unsigned char>( // odd byte width: columns are bytes, so the element type must be 1 byte wide
+                    PtrStepSzb(src.rows, bcols, src.data, src.step),
+                    PtrStepSzb(src.rows, bcols, dst.data, dst.step),
+                    mask, stream);
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Binary bitwise logical operations
 
-namespace cv { namespace gpu { namespace device
+namespace arithm
 {
-    void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
+    template <typename T> void bitMatAnd(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template <typename T> void bitMatOr(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+    template <typename T> void bitMatXor(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
+}
 
-    template <typename T>
-    void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
-
-    void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
-
-    template <typename T>
-    void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
-
-    void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
-
-    template <typename T>
-    void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
-}}}
-
-namespace
+void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s) // dst = src1 & src2, optionally masked; dst is (re)allocated to src1's size/type
 {
-    void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+    using namespace arithm;
+
+    const int depth = src1.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) ); // NOTE(review): mask is per-pixel while rows below are passed as packed words - confirm the kernel's mask indexing
+
+    dst.create(src1.size(), src1.type());
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    const int bcols = src1.cols * src1.elemSize(); // row width in bytes; the operation is type-agnostic
+
+    if ((bcols & 3) == 0) // whole number of 32-bit words per row; NOTE(review): pointer/step alignment is not checked here
     {
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+        const int vcols = bcols >> 2;
 
-        dst.create(src1.size(), src1.type());
-
-        cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
+        bitMatAnd<unsigned int>(
+                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                    mask, stream);
     }
-
-    void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
+    else if ((bcols & 1) == 0) // whole number of 16-bit words per row
     {
-        using namespace cv::gpu::device;
+        const int vcols = bcols >> 1;
 
-        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static func_t funcs[] =
-        {
-            bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>,
-            bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>,
-            bitwiseMaskOrCaller<unsigned int>, bitwiseMaskOrCaller<unsigned int>,
-            bitwiseMaskOrCaller<unsigned int>
-        };
-
-        CV_Assert(src1.depth() <= CV_64F);
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
-
-        dst.create(src1.size(), src1.type());
-
-        const func_t func = funcs[src1.depth()];
-
-        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-
-        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+        bitMatAnd<unsigned short>( // vcols counts 2-byte units, so the element type must be 2 bytes wide
+                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                    mask, stream);
     }
-
-
-    void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
+    else
     {
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
 
-        dst.create(src1.size(), src1.type());
-
-        cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
-    }
-
-    void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
-    {
-        using namespace cv::gpu::device;
-
-        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static func_t funcs[] =
-        {
-            bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>,
-            bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>,
-            bitwiseMaskAndCaller<unsigned int>, bitwiseMaskAndCaller<unsigned int>,
-            bitwiseMaskAndCaller<unsigned int>
-        };
-
-        CV_Assert(src1.depth() <= CV_64F);
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
-
-        dst.create(src1.size(), src1.type());
-
-        const func_t func = funcs[src1.depth()];
-
-        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-
-        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
-    }
-
-
-    void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
-    {
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-
-        dst.create(src1.size(), src1.type());
-
-        cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
-    }
-
-    void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
-    {
-        using namespace cv::gpu::device;
-
-        typedef void (*func_t)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
-        static func_t funcs[] =
-        {
-            bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>,
-            bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>,
-            bitwiseMaskXorCaller<unsigned int>, bitwiseMaskXorCaller<unsigned int>,
-            bitwiseMaskXorCaller<unsigned int>
-        };
-
-        CV_Assert(src1.depth() <= CV_64F);
-        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-        CV_Assert(mask.type() == CV_8U && mask.size() == src1.size());
-
-        dst.create(src1.size(), src1.type());
-
-        const func_t func = funcs[src1.depth()];
-
-        int cn = dst.depth() != CV_64F ? dst.channels() : dst.channels() * (sizeof(double) / sizeof(unsigned int));
-
-        func(dst.rows, dst.cols, cn, src1, src2, mask, dst, stream);
+        bitMatAnd<unsigned char>( // odd byte width: columns are bytes, so the element type must be 1 byte wide
+                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                    mask, stream);
     }
 }
 
-void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
+void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s) // dst = src1 | src2, optionally masked; dst is (re)allocated to src1's size/type
 {
-    if (mask.empty())
-        bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));
+    using namespace arithm;
+
+    const int depth = src1.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) ); // NOTE(review): mask is per-pixel while rows below are passed as packed words - confirm the kernel's mask indexing
+
+    dst.create(src1.size(), src1.type());
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    const int bcols = src1.cols * src1.elemSize(); // row width in bytes; the operation is type-agnostic
+
+    if ((bcols & 3) == 0) // whole number of 32-bit words per row; NOTE(review): pointer/step alignment is not checked here
+    {
+        const int vcols = bcols >> 2;
+
+        bitMatOr<unsigned int>(
+                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                    mask, stream);
+    }
+    else if ((bcols & 1) == 0) // whole number of 16-bit words per row
+    {
+        const int vcols = bcols >> 1;
+
+        bitMatOr<unsigned short>( // vcols counts 2-byte units, so the element type must be 2 bytes wide
+                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                    mask, stream);
+    }
     else
-        bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+    {
+
+        bitMatOr<unsigned char>( // odd byte width: columns are bytes, so the element type must be 1 byte wide
+                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                    mask, stream);
+    }
 }
 
-void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
+void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& s) // dst = src1 ^ src2, optionally masked; dst is (re)allocated to src1's size/type
 {
-    if (mask.empty())
-        bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));
+    using namespace arithm;
+
+    const int depth = src1.depth();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src2.size() == src1.size() && src2.type() == src1.type() );
+    CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src1.size()) ); // NOTE(review): mask is per-pixel while rows below are passed as packed words - confirm the kernel's mask indexing
+
+    dst.create(src1.size(), src1.type());
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    const int bcols = src1.cols * src1.elemSize(); // row width in bytes; the operation is type-agnostic
+
+    if ((bcols & 3) == 0) // whole number of 32-bit words per row; NOTE(review): pointer/step alignment is not checked here
+    {
+        const int vcols = bcols >> 2;
+
+        bitMatXor<unsigned int>(
+                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                    mask, stream);
+    }
+    else if ((bcols & 1) == 0) // whole number of 16-bit words per row
+    {
+        const int vcols = bcols >> 1;
+
+        bitMatXor<unsigned short>( // vcols counts 2-byte units, so the element type must be 2 bytes wide
+                    PtrStepSzb(src1.rows, vcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, vcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, vcols, dst.data, dst.step),
+                    mask, stream);
+    }
     else
-        bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+    {
+
+        bitMatXor<unsigned char>( // odd byte width: columns are bytes, so the element type must be 1 byte wide
+                    PtrStepSzb(src1.rows, bcols, src1.data, src1.step),
+                    PtrStepSzb(src1.rows, bcols, src2.data, src2.step),
+                    PtrStepSzb(src1.rows, bcols, dst.data, dst.step),
+                    mask, stream);
+    }
 }
 
-void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
+//////////////////////////////////////////////////////////////////////////////
+// Binary bitwise logical operations with scalars
+
+namespace arithm
 {
-    if (mask.empty())
-        bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));
-    else
-        bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
+    template <typename T> void bitScalarAnd(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void bitScalarOr(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void bitScalarXor(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
 namespace
 {
+    typedef void (*bit_scalar_func_t)(PtrStepSzb src1, unsigned int src2, PtrStepSzb dst, cudaStream_t stream);
+
+    template <bit_scalar_func_t func> struct BitScalar // adapts a bit_scalar_func_t to the (GpuMat, Scalar, GpuMat, stream) dispatch signature
+    {
+        static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
+        {
+            func(src, static_cast<unsigned int>(sc.val[0]), dst, stream); // only sc.val[0] is used; NOTE(review): the double -> unsigned int cast is not saturating - confirm behaviour for negative scalars
+        }
+    };
+
+    template <bit_scalar_func_t func> struct BitScalar4 // packs four scalar components into one 32-bit word (one byte each) and forwards to func
+    {
+        static void call(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream)
+        {
+            Scalar_<unsigned int> isc = sc;
+
+            unsigned int packedVal = 0;
+
+            packedVal |= (isc.val[0] & 0xff); // keep 8 bits per component so a value above 255 cannot spill into the next byte
+            packedVal |= (isc.val[1] & 0xff) << 8;
+            packedVal |= (isc.val[2] & 0xff) << 16;
+            packedVal |= (isc.val[3] & 0xff) << 24;
+
+            func(src, packedVal, dst, stream);
+        }
+    };
+
     template <int DEPTH, int cn> struct NppBitwiseCFunc
     {
         typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
@@ -1739,64 +2358,79 @@ namespace
     };
 }
 
-void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
-{
-    typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
-    static const func_t funcs[5][4] =
-    {
-        {NppBitwiseC<CV_8U , 1, nppiOrC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiOrC_8u_C4R >::call},
-        {0,0,0,0},
-        {NppBitwiseC<CV_16U, 1, nppiOrC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
-        {0,0,0,0},
-        {NppBitwiseC<CV_32S, 1, nppiOrC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
-    };
-
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
-
-    dst.create(src.size(), src.type());
-
-    funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
-}
-
 void cv::gpu::bitwise_and(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
+    using namespace arithm; // brings the bitScalarAnd<T> declarations into scope for the table below
+
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U , 1, nppiAndC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiAndC_8u_C4R >::call},
+        {BitScalar< bitScalarAnd<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiAndC_8u_C3R >::call, BitScalar4< bitScalarAnd<unsigned int> >::call},
         {0,0,0,0},
-        {NppBitwiseC<CV_16U, 1, nppiAndC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
+        {BitScalar< bitScalarAnd<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiAndC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiAndC_16u_C4R>::call},
         {0,0,0,0},
-        {NppBitwiseC<CV_32S, 1, nppiAndC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
+        {BitScalar< bitScalarAnd<unsigned int> >::call  , 0, NppBitwiseC<CV_32S, 3, nppiAndC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiAndC_32s_C4R>::call}
     };
 
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+    const int depth = src.depth();
+    const int cn = src.channels();
+
+    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); // these two asserts exclude every null entry of funcs
+    CV_Assert( cn == 1 || cn == 3 || cn == 4 );
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); // table is indexed [depth][channels - 1]
+}
+
+void cv::gpu::bitwise_or(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream) // dst = src | sc; dst is (re)allocated to src's size/type
+{
+    using namespace arithm; // brings the bitScalarOr<T> declarations into scope for the table below
+
+    typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
+    static const func_t funcs[5][4] =
+    {
+        {BitScalar< bitScalarOr<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiOrC_8u_C3R >::call, BitScalar4< bitScalarOr<unsigned int> >::call},
+        {0,0,0,0},
+        {BitScalar< bitScalarOr<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiOrC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiOrC_16u_C4R>::call},
+        {0,0,0,0},
+        {BitScalar< bitScalarOr<unsigned int> >::call  , 0, NppBitwiseC<CV_32S, 3, nppiOrC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiOrC_32s_C4R>::call}
+    };
+
+    const int depth = src.depth();
+    const int cn = src.channels();
+
+    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); // these two asserts exclude every null entry of funcs
+    CV_Assert( cn == 1 || cn == 3 || cn == 4 );
+
+    dst.create(src.size(), src.type());
+
+    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); // table is indexed [depth][channels - 1]
 }
 
 void cv::gpu::bitwise_xor(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream)
 {
+    using namespace arithm; // brings the bitScalarXor<T> declarations into scope for the table below
+
     typedef void (*func_t)(const GpuMat& src, Scalar sc, GpuMat& dst, cudaStream_t stream);
     static const func_t funcs[5][4] =
     {
-        {NppBitwiseC<CV_8U , 1, nppiXorC_8u_C1R >::call, 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, NppBitwiseC<CV_8U , 4, nppiXorC_8u_C4R >::call},
+        {BitScalar< bitScalarXor<unsigned char> >::call , 0, NppBitwiseC<CV_8U , 3, nppiXorC_8u_C3R >::call, BitScalar4< bitScalarXor<unsigned int> >::call},
         {0,0,0,0},
-        {NppBitwiseC<CV_16U, 1, nppiXorC_16u_C1R>::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
+        {BitScalar< bitScalarXor<unsigned short> >::call, 0, NppBitwiseC<CV_16U, 3, nppiXorC_16u_C3R>::call, NppBitwiseC<CV_16U, 4, nppiXorC_16u_C4R>::call},
         {0,0,0,0},
-        {NppBitwiseC<CV_32S, 1, nppiXorC_32s_C1R>::call, 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
+        {BitScalar< bitScalarXor<unsigned int> >::call  , 0, NppBitwiseC<CV_32S, 3, nppiXorC_32s_C3R>::call, NppBitwiseC<CV_32S, 4, nppiXorC_32s_C4R>::call}
     };
 
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
+    const int depth = src.depth();
+    const int cn = src.channels();
+
+    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32S ); // these two asserts exclude every null entry of funcs
+    CV_Assert( cn == 1 || cn == 3 || cn == 4 );
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()][src.channels() - 1](src, sc, dst, StreamAccessor::getStream(stream));
+    funcs[depth][cn - 1](src, sc, dst, StreamAccessor::getStream(stream)); // table is indexed [depth][channels - 1]
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -1898,91 +2532,226 @@ void cv::gpu::lshift(const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& st
 //////////////////////////////////////////////////////////////////////////////
 // Minimum and maximum operations
 
-namespace cv { namespace gpu { namespace device
+namespace arithm
 {
-    template <typename T> void min_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void max_gpu(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void vmin4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void vmin2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void minMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 
-    template <typename T> void min_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream);
-    template <typename T> void max_gpu(const PtrStepSzb src, T val, PtrStepSzb dst, cudaStream_t stream);
-}}}
-
-void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
-{
-    using namespace cv::gpu::device;
-
-    typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[] =
-    {
-        min_gpu<unsigned char>,
-        min_gpu<signed char>,
-        min_gpu<unsigned short>,
-        min_gpu<short>,
-        min_gpu<int>,
-        min_gpu<float>,
-        min_gpu<double>
-    };
-
-    CV_Assert(src1.depth() <= CV_64F);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
-
-    if (src1.depth() == CV_64F)
-    {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
-            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    dst.create(src1.size(), src1.type());
-
-    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+    template <typename T> void vmax4(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void vmax2(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void maxMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream)
+void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) // element-wise minimum of two same-type, same-size matrices; dst is (re)allocated
 {
-    using namespace cv::gpu::device;
+    using namespace arithm;
 
-    typedef void (*func_t)(const PtrStepSzb src1, const PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        max_gpu<unsigned char>,
-        max_gpu<signed char>,
-        max_gpu<unsigned short>,
-        max_gpu<short>,
-        max_gpu<int>,
-        max_gpu<float>,
-        max_gpu<double>
+        minMat<unsigned char>,
+        minMat<signed char>,
+        minMat<unsigned short>,
+        minMat<short>,
+        minMat<int>,
+        minMat<float>,
+        minMat<double>
+    };
+    static const func_t vfuncs4[] = // indexed by depth (CV_8U..CV_16S); 0 = no 4-per-word variant for that depth
+    {
+        vmin4<unsigned int>,
+        vmin4<int>,
+        0,
+        0
+    };
+    static const func_t vfuncs2[] = // indexed by depth (CV_8U..CV_16S); 0 = no 2-per-word variant for that depth
+    {
+        0,
+        0,
+        vmin2<unsigned int>,
+        vmin2<int>
     };
 
-    CV_Assert(src1.depth() <= CV_64F);
-    CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
+    const int depth = src1.depth();
+    const int cn = src1.channels();
 
-    if (src1.depth() == CV_64F)
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+
+    if (depth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
     dst.create(src1.size(), src1.type());
 
-    funcs[src1.depth()](src1.reshape(1), src2.reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // channels folded into columns: one column per scalar element
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+    if (depth < CV_32S) // only the four depths below CV_32S have entries in vfuncs4/vfuncs2
+    {
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // base pointers only; NOTE(review): row steps are not checked
+
+        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        {
+            const func_t vfunc4 = vfuncs4[depth];
+            const func_t vfunc2 = vfuncs2[depth];
+
+            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            {
+                const int vcols = src1_.cols >> 2; // four elements per column handed to vfunc4
+
+                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+
+            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            {
+                const int vcols = src1_.cols >> 1; // two elements per column handed to vfunc2
+
+                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+        }
+    }
+
+    const func_t func = funcs[depth]; // generic per-element path when no packed variant applied
+
+    if (!func) // defensive: every funcs entry above is non-null
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, stream);
+}
+
+void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s) // element-wise maximum of two same-type, same-size matrices; dst is (re)allocated
+{
+    using namespace arithm;
+
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
+    static const func_t funcs[] =
+    {
+        maxMat<unsigned char>,
+        maxMat<signed char>,
+        maxMat<unsigned short>,
+        maxMat<short>,
+        maxMat<int>,
+        maxMat<float>,
+        maxMat<double>
+    };
+    static const func_t vfuncs4[] = // indexed by depth (CV_8U..CV_16S); 0 = no 4-per-word variant for that depth
+    {
+        vmax4<unsigned int>,
+        vmax4<int>,
+        0,
+        0
+    };
+    static const func_t vfuncs2[] = // indexed by depth (CV_8U..CV_16S); 0 = no 2-per-word variant for that depth
+    {
+        0,
+        0,
+        vmax2<unsigned int>,
+        vmax2<int>
+    };
+
+    const int depth = src1.depth();
+    const int cn = src1.channels();
+
+    CV_Assert( depth <= CV_64F );
+    CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
+
+    if (depth == CV_64F)
+    {
+        if (!deviceSupports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src1.size(), src1.type());
+
+    cudaStream_t stream = StreamAccessor::getStream(s);
+
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // channels folded into columns: one column per scalar element
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+
+    if (depth < CV_32S) // only the four depths below CV_32S have entries in vfuncs4/vfuncs2
+    {
+        const intptr_t src1ptr = reinterpret_cast<intptr_t>(src1_.data);
+        const intptr_t src2ptr = reinterpret_cast<intptr_t>(src2_.data);
+        const intptr_t dstptr = reinterpret_cast<intptr_t>(dst_.data);
+
+        const bool isAllAligned = (src1ptr & 31) == 0 && (src2ptr & 31) == 0 && (dstptr & 31) == 0; // base pointers only; NOTE(review): row steps are not checked
+
+        if (deviceSupports(FEATURE_SET_COMPUTE_20) && isAllAligned)
+        {
+            const func_t vfunc4 = vfuncs4[depth];
+            const func_t vfunc2 = vfuncs2[depth];
+
+            if (vfunc4 != 0 && (src1_.cols & 3) == 0)
+            {
+                const int vcols = src1_.cols >> 2; // four elements per column handed to vfunc4
+
+                vfunc4(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+
+            if (vfunc2 != 0 && (src1_.cols & 1) == 0)
+            {
+                const int vcols = src1_.cols >> 1; // two elements per column handed to vfunc2
+
+                vfunc2(PtrStepSzb(src1_.rows, vcols, src1_.data, src1_.step),
+                       PtrStepSzb(src1_.rows, vcols, src2_.data, src2_.step),
+                       PtrStepSzb(src1_.rows, vcols, dst_.data, dst_.step),
+                       stream);
+
+                return;
+            }
+        }
+    }
+
+    const func_t func = funcs[depth]; // generic per-element path when no packed variant applied
+
+    if (!func) // defensive: every funcs entry above is non-null
+        CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, stream);
 }
 
 namespace
 {
-    template <typename T> void minScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream)
+    template <typename T> double castScalar(double val) // returns val after a saturate_cast to T, widened back to double
     {
-        cv::gpu::device::min_gpu(src, saturate_cast<T>(val), dst, stream);
-    }
-
-    template <typename T> void maxScalar(const PtrStepSzb src, double val, PtrStepSzb dst, cudaStream_t stream)
-    {
-        cv::gpu::device::max_gpu(src, saturate_cast<T>(val), dst, stream);
+        return saturate_cast<T>(val); // the implicit conversion to double happens on return
     }
 }
 
 void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    using namespace arithm; // NOTE(review): minScalar<T> below presumably resolves in arithm now that the local wrapper is removed - confirm
+
+    typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); // common signature of the funcs entries
     static const func_t funcs[] =
     {
         minScalar<unsigned char>,
@@ -1994,23 +2763,33 @@ void cv::gpu::min(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
         minScalar<double>
     };
 
-    CV_Assert(src.depth() <= CV_64F);
-    CV_Assert(src.channels() == 1);
-
-    if (src.depth() == CV_64F)
+    typedef double (*cast_func_t)(double sc); // scalar conversion: double in, double out
+    static const cast_func_t cast_func[] =
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double> // seven entries, looked up by depth at the call below
+    };
+
+    const int depth = src.depth(); // read once; used by the checks and both table lookups
+
+    CV_Assert( depth <= CV_64F ); // upper bound for the funcs / cast_func lookups
+    CV_Assert( src.channels() == 1 ); // only single-channel input is accepted
+
+    if (depth == CV_64F) // double input needs the device capability check below
+    {
+        if (!deviceSupports(NATIVE_DOUBLE)) // replaces the removed builtWith / DeviceInfo test; deviceSupports is defined outside this hunk
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
+    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream)); // val goes through cast_func[depth] before dispatch to funcs[depth]
 }
 
 void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
 {
-    typedef void (*func_t)(const PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
+    using namespace arithm; // NOTE(review): maxScalar<T> below presumably resolves in arithm now that the local wrapper is removed - confirm
+
+    typedef void (*func_t)(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream); // common signature of the funcs entries
     static const func_t funcs[] =
     {
         maxScalar<unsigned char>,
@@ -2022,45 +2801,47 @@ void cv::gpu::max(const GpuMat& src, double val, GpuMat& dst, Stream& stream)
         maxScalar<double>
     };
 
-    CV_Assert(src.depth() <= CV_64F);
-    CV_Assert(src.channels() == 1);
-
-    if (src.depth() == CV_64F)
+    typedef double (*cast_func_t)(double sc); // scalar conversion: double in, double out
+    static const cast_func_t cast_func[] =
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double> // seven entries, looked up by depth at the call below
+    };
+
+    const int depth = src.depth(); // read once; used by the checks and both table lookups
+
+    CV_Assert( depth <= CV_64F ); // upper bound for the funcs / cast_func lookups
+    CV_Assert( src.channels() == 1 ); // only single-channel input is accepted
+
+    if (depth == CV_64F) // double input needs the device capability check below
+    {
+        if (!deviceSupports(NATIVE_DOUBLE)) // replaces the removed builtWith / DeviceInfo test; deviceSupports is defined outside this hunk
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()](src, val, dst, StreamAccessor::getStream(stream));
+    funcs[depth](src, cast_func[depth](val), dst, StreamAccessor::getStream(stream)); // val goes through cast_func[depth] before dispatch to funcs[depth]
 }
 
 ////////////////////////////////////////////////////////////////////////
 // threshold
 
-namespace cv { namespace gpu { namespace device
+namespace arithm // holds the forward declaration used by cv::gpu::threshold below
 {
     template <typename T>
-    void threshold_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, T thresh, T maxVal, int type, cudaStream_t stream);
-}}}
-
-namespace
-{
-    template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
-    {
-        cv::gpu::device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
-    }
+    void threshold(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); // declaration only; thresh/maxVal are now double, whereas the removed threshold_caller applied saturate_cast<T> before calling threshold_gpu
 }
 
 double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, Stream& s)
 {
-    CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
-    CV_Assert(type <= THRESH_TOZERO_INV);
+    const int depth = src.depth(); // read once; used by the checks and the funcs lookup
 
-    if (src.depth() == CV_64F)
+    CV_Assert( src.channels() == 1 && depth <= CV_64F ); // single-channel only; depth bound guards the funcs lookup in the else-branch
+    CV_Assert( type <= THRESH_TOZERO_INV ); // NOTE(review): upper bound only - a negative type is not rejected by this check
+
+    if (depth == CV_64F) // double input needs the device capability check below
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE)) // replaces the removed builtWith / DeviceInfo test; deviceSupports is defined outside this hunk
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
@@ -2084,21 +2865,25 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
     }
     else
     {
-        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);
+        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream); // same parameter list as the arithm::threshold<T> declaration
         static const func_t funcs[] =
         {
-            threshold_caller<unsigned char>, threshold_caller<signed char>,
-            threshold_caller<unsigned short>, threshold_caller<short>,
-            threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
+            arithm::threshold<unsigned char>,
+            arithm::threshold<signed char>,
+            arithm::threshold<unsigned short>,
+            arithm::threshold<short>,
+            arithm::threshold<int>,
+            arithm::threshold<float>,
+            arithm::threshold<double> // seventh and last entry; the table is looked up by depth below
         };
 
-        if (src.depth() != CV_32F && src.depth() != CV_64F)
+        if (depth != CV_32F && depth != CV_64F) // for every depth other than float/double, thresh is floored and maxVal rounded first
         {
             thresh = cvFloor(thresh);
             maxVal = cvRound(maxVal);
         }
 
-        funcs[src.depth()](src, dst, thresh, maxVal, type, stream);
+        funcs[depth](src, dst, thresh, maxVal, type, stream); // src/dst are GpuMat here and are converted to the PtrStepSzb parameters of func_t
     }
 
     return thresh;
@@ -2107,34 +2892,42 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
 ////////////////////////////////////////////////////////////////////////
 // pow
 
-namespace cv { namespace gpu { namespace device
+namespace arithm // holds the forward declaration used by cv::gpu::pow below
 {
-    template<typename T> void pow_caller(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
-}}}
+    template<typename T> void pow(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream); // declaration only; same parameter list as the removed pow_caller<T>
+}
 
 void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
 {
-    using namespace cv::gpu::device;
-
     typedef void (*func_t)(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[] =
     {
-        pow_caller<unsigned char>,  pow_caller<signed char>,
-        pow_caller<unsigned short>, pow_caller<short>,
-        pow_caller<int>, pow_caller<float>, pow_caller<double>
+        arithm::pow<unsigned char>,
+        arithm::pow<signed char>,
+        arithm::pow<unsigned short>,
+        arithm::pow<short>,
+        arithm::pow<int>,
+        arithm::pow<float>,
+        arithm::pow<double> // seventh and last entry; the table is looked up by depth below
     };
 
-    CV_Assert(src.depth() <= CV_64F);
+    const int depth = src.depth(); // read once; used by the checks and the funcs lookup
+    const int cn = src.channels(); // channel count, used to widen the column count of the views below
 
-    if (src.depth() == CV_64F)
+    CV_Assert(depth <= CV_64F); // upper bound for the funcs[depth] lookup
+
+    if (depth == CV_64F) // double input needs the device capability check below
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE)) // replaces the removed builtWith / DeviceInfo test; deviceSupports is defined outside this hunk
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
     dst.create(src.size(), src.type());
 
-    funcs[src.depth()](src.reshape(1), power, dst.reshape(1), StreamAccessor::getStream(stream));
+    PtrStepSzb src_(src.rows, src.cols * cn, src.data, src.step); // view of src with cols * cn columns; stands in for the removed src.reshape(1)
+    PtrStepSzb dst_(src.rows, src.cols * cn, dst.data, dst.step); // built from src.rows / src.cols; dst was created with src.size() and src.type() just above
+
+    funcs[depth](src_, power, dst_, StreamAccessor::getStream(stream)); // dispatch on depth with the widened views
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -2200,8 +2993,8 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int
         NppAlphaComp<CV_32F, nppiAlphaComp_32f_AC4R>::call
     };
 
-    CV_Assert(img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4);
-    CV_Assert(img1.size() == img2.size() && img1.type() == img2.type());
+    CV_Assert( img1.type() == CV_8UC4 || img1.type() == CV_16UC4 || img1.type() == CV_32SC4 || img1.type() == CV_32FC4 );
+    CV_Assert( img1.size() == img2.size() && img1.type() == img2.type() );
 
     dst.create(img1.size(), img1.type());
 
@@ -2213,507 +3006,508 @@ void cv::gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int
 ////////////////////////////////////////////////////////////////////////
 // addWeighted
 
-namespace cv { namespace gpu { namespace device
+namespace arithm // holds the forward declaration used by cv::gpu::addWeighted below
 {
     template <typename T1, typename T2, typename D>
-    void addWeighted_gpu(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-}}}
+    void addWeighted(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream); // declaration only; PtrStepSzb arguments are now taken by value instead of by const reference
+}
 
-void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
+void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int ddepth, Stream& stream)
 {
-    using namespace cv::gpu::device;
-
-    typedef void (*func_t)(const PtrStepSzb& src1, double alpha, const PtrStepSzb& src2, double beta, double gamma, const PtrStepSzb& dst, cudaStream_t stream);
-
+    typedef void (*func_t)(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][7][7] =
     {
         {
             {
-                addWeighted_gpu<unsigned char, unsigned char, unsigned char >,
-                addWeighted_gpu<unsigned char, unsigned char, signed char >,
-                addWeighted_gpu<unsigned char, unsigned char, unsigned short>,
-                addWeighted_gpu<unsigned char, unsigned char, short >,
-                addWeighted_gpu<unsigned char, unsigned char, int   >,
-                addWeighted_gpu<unsigned char, unsigned char, float >,
-                addWeighted_gpu<unsigned char, unsigned char, double>
+                arithm::addWeighted<unsigned char, unsigned char, unsigned char >,
+                arithm::addWeighted<unsigned char, unsigned char, signed char >,
+                arithm::addWeighted<unsigned char, unsigned char, unsigned short>,
+                arithm::addWeighted<unsigned char, unsigned char, short >,
+                arithm::addWeighted<unsigned char, unsigned char, int   >,
+                arithm::addWeighted<unsigned char, unsigned char, float >,
+                arithm::addWeighted<unsigned char, unsigned char, double>
             },
             {
-                addWeighted_gpu<unsigned char, signed char, unsigned char >,
-                addWeighted_gpu<unsigned char, signed char, signed char >,
-                addWeighted_gpu<unsigned char, signed char, unsigned short>,
-                addWeighted_gpu<unsigned char, signed char, short >,
-                addWeighted_gpu<unsigned char, signed char, int   >,
-                addWeighted_gpu<unsigned char, signed char, float >,
-                addWeighted_gpu<unsigned char, signed char, double>
+                arithm::addWeighted<unsigned char, signed char, unsigned char >,
+                arithm::addWeighted<unsigned char, signed char, signed char >,
+                arithm::addWeighted<unsigned char, signed char, unsigned short>,
+                arithm::addWeighted<unsigned char, signed char, short >,
+                arithm::addWeighted<unsigned char, signed char, int   >,
+                arithm::addWeighted<unsigned char, signed char, float >,
+                arithm::addWeighted<unsigned char, signed char, double>
             },
             {
-                addWeighted_gpu<unsigned char, unsigned short, unsigned char >,
-                addWeighted_gpu<unsigned char, unsigned short, signed char >,
-                addWeighted_gpu<unsigned char, unsigned short, unsigned short>,
-                addWeighted_gpu<unsigned char, unsigned short, short >,
-                addWeighted_gpu<unsigned char, unsigned short, int   >,
-                addWeighted_gpu<unsigned char, unsigned short, float >,
-                addWeighted_gpu<unsigned char, unsigned short, double>
+                arithm::addWeighted<unsigned char, unsigned short, unsigned char >,
+                arithm::addWeighted<unsigned char, unsigned short, signed char >,
+                arithm::addWeighted<unsigned char, unsigned short, unsigned short>,
+                arithm::addWeighted<unsigned char, unsigned short, short >,
+                arithm::addWeighted<unsigned char, unsigned short, int   >,
+                arithm::addWeighted<unsigned char, unsigned short, float >,
+                arithm::addWeighted<unsigned char, unsigned short, double>
             },
             {
-                addWeighted_gpu<unsigned char, short, unsigned char >,
-                addWeighted_gpu<unsigned char, short, signed char >,
-                addWeighted_gpu<unsigned char, short, unsigned short>,
-                addWeighted_gpu<unsigned char, short, short >,
-                addWeighted_gpu<unsigned char, short, int   >,
-                addWeighted_gpu<unsigned char, short, float >,
-                addWeighted_gpu<unsigned char, short, double>
+                arithm::addWeighted<unsigned char, short, unsigned char >,
+                arithm::addWeighted<unsigned char, short, signed char >,
+                arithm::addWeighted<unsigned char, short, unsigned short>,
+                arithm::addWeighted<unsigned char, short, short >,
+                arithm::addWeighted<unsigned char, short, int   >,
+                arithm::addWeighted<unsigned char, short, float >,
+                arithm::addWeighted<unsigned char, short, double>
             },
             {
-                addWeighted_gpu<unsigned char, int, unsigned char >,
-                addWeighted_gpu<unsigned char, int, signed char >,
-                addWeighted_gpu<unsigned char, int, unsigned short>,
-                addWeighted_gpu<unsigned char, int, short >,
-                addWeighted_gpu<unsigned char, int, int   >,
-                addWeighted_gpu<unsigned char, int, float >,
-                addWeighted_gpu<unsigned char, int, double>
+                arithm::addWeighted<unsigned char, int, unsigned char >,
+                arithm::addWeighted<unsigned char, int, signed char >,
+                arithm::addWeighted<unsigned char, int, unsigned short>,
+                arithm::addWeighted<unsigned char, int, short >,
+                arithm::addWeighted<unsigned char, int, int   >,
+                arithm::addWeighted<unsigned char, int, float >,
+                arithm::addWeighted<unsigned char, int, double>
             },
             {
-                addWeighted_gpu<unsigned char, float, unsigned char >,
-                addWeighted_gpu<unsigned char, float, signed char >,
-                addWeighted_gpu<unsigned char, float, unsigned short>,
-                addWeighted_gpu<unsigned char, float, short >,
-                addWeighted_gpu<unsigned char, float, int   >,
-                addWeighted_gpu<unsigned char, float, float >,
-                addWeighted_gpu<unsigned char, float, double>
+                arithm::addWeighted<unsigned char, float, unsigned char >,
+                arithm::addWeighted<unsigned char, float, signed char >,
+                arithm::addWeighted<unsigned char, float, unsigned short>,
+                arithm::addWeighted<unsigned char, float, short >,
+                arithm::addWeighted<unsigned char, float, int   >,
+                arithm::addWeighted<unsigned char, float, float >,
+                arithm::addWeighted<unsigned char, float, double>
             },
             {
-                addWeighted_gpu<unsigned char, double, unsigned char >,
-                addWeighted_gpu<unsigned char, double, signed char >,
-                addWeighted_gpu<unsigned char, double, unsigned short>,
-                addWeighted_gpu<unsigned char, double, short >,
-                addWeighted_gpu<unsigned char, double, int   >,
-                addWeighted_gpu<unsigned char, double, float >,
-                addWeighted_gpu<unsigned char, double, double>
+                arithm::addWeighted<unsigned char, double, unsigned char >,
+                arithm::addWeighted<unsigned char, double, signed char >,
+                arithm::addWeighted<unsigned char, double, unsigned short>,
+                arithm::addWeighted<unsigned char, double, short >,
+                arithm::addWeighted<unsigned char, double, int   >,
+                arithm::addWeighted<unsigned char, double, float >,
+                arithm::addWeighted<unsigned char, double, double>
             }
         },
         {
             {
-                0/*addWeighted_gpu<signed char, unsigned char, unsigned char >*/,
-                0/*addWeighted_gpu<signed char, unsigned char, signed char >*/,
-                0/*addWeighted_gpu<signed char, unsigned char, unsigned short>*/,
-                0/*addWeighted_gpu<signed char, unsigned char, short >*/,
-                0/*addWeighted_gpu<signed char, unsigned char, int   >*/,
-                0/*addWeighted_gpu<signed char, unsigned char, float >*/,
-                0/*addWeighted_gpu<signed char, unsigned char, double>*/
+                0/*arithm::addWeighted<signed char, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<signed char, unsigned char, short >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, int   >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, float >*/,
+                0/*arithm::addWeighted<signed char, unsigned char, double>*/
             },
             {
-                addWeighted_gpu<signed char, signed char, unsigned char >,
-                addWeighted_gpu<signed char, signed char, signed char >,
-                addWeighted_gpu<signed char, signed char, unsigned short>,
-                addWeighted_gpu<signed char, signed char, short >,
-                addWeighted_gpu<signed char, signed char, int   >,
-                addWeighted_gpu<signed char, signed char, float >,
-                addWeighted_gpu<signed char, signed char, double>
+                arithm::addWeighted<signed char, signed char, unsigned char >,
+                arithm::addWeighted<signed char, signed char, signed char >,
+                arithm::addWeighted<signed char, signed char, unsigned short>,
+                arithm::addWeighted<signed char, signed char, short >,
+                arithm::addWeighted<signed char, signed char, int   >,
+                arithm::addWeighted<signed char, signed char, float >,
+                arithm::addWeighted<signed char, signed char, double>
             },
             {
-                addWeighted_gpu<signed char, unsigned short, unsigned char >,
-                addWeighted_gpu<signed char, unsigned short, signed char >,
-                addWeighted_gpu<signed char, unsigned short, unsigned short>,
-                addWeighted_gpu<signed char, unsigned short, short >,
-                addWeighted_gpu<signed char, unsigned short, int   >,
-                addWeighted_gpu<signed char, unsigned short, float >,
-                addWeighted_gpu<signed char, unsigned short, double>
+                arithm::addWeighted<signed char, unsigned short, unsigned char >,
+                arithm::addWeighted<signed char, unsigned short, signed char >,
+                arithm::addWeighted<signed char, unsigned short, unsigned short>,
+                arithm::addWeighted<signed char, unsigned short, short >,
+                arithm::addWeighted<signed char, unsigned short, int   >,
+                arithm::addWeighted<signed char, unsigned short, float >,
+                arithm::addWeighted<signed char, unsigned short, double>
             },
             {
-                addWeighted_gpu<signed char, short, unsigned char >,
-                addWeighted_gpu<signed char, short, signed char >,
-                addWeighted_gpu<signed char, short, unsigned short>,
-                addWeighted_gpu<signed char, short, short >,
-                addWeighted_gpu<signed char, short, int   >,
-                addWeighted_gpu<signed char, short, float >,
-                addWeighted_gpu<signed char, short, double>
+                arithm::addWeighted<signed char, short, unsigned char >,
+                arithm::addWeighted<signed char, short, signed char >,
+                arithm::addWeighted<signed char, short, unsigned short>,
+                arithm::addWeighted<signed char, short, short >,
+                arithm::addWeighted<signed char, short, int   >,
+                arithm::addWeighted<signed char, short, float >,
+                arithm::addWeighted<signed char, short, double>
             },
             {
-                addWeighted_gpu<signed char, int, unsigned char >,
-                addWeighted_gpu<signed char, int, signed char >,
-                addWeighted_gpu<signed char, int, unsigned short>,
-                addWeighted_gpu<signed char, int, short >,
-                addWeighted_gpu<signed char, int, int   >,
-                addWeighted_gpu<signed char, int, float >,
-                addWeighted_gpu<signed char, int, double>
+                arithm::addWeighted<signed char, int, unsigned char >,
+                arithm::addWeighted<signed char, int, signed char >,
+                arithm::addWeighted<signed char, int, unsigned short>,
+                arithm::addWeighted<signed char, int, short >,
+                arithm::addWeighted<signed char, int, int   >,
+                arithm::addWeighted<signed char, int, float >,
+                arithm::addWeighted<signed char, int, double>
             },
             {
-                addWeighted_gpu<signed char, float, unsigned char >,
-                addWeighted_gpu<signed char, float, signed char >,
-                addWeighted_gpu<signed char, float, unsigned short>,
-                addWeighted_gpu<signed char, float, short >,
-                addWeighted_gpu<signed char, float, int   >,
-                addWeighted_gpu<signed char, float, float >,
-                addWeighted_gpu<signed char, float, double>
+                arithm::addWeighted<signed char, float, unsigned char >,
+                arithm::addWeighted<signed char, float, signed char >,
+                arithm::addWeighted<signed char, float, unsigned short>,
+                arithm::addWeighted<signed char, float, short >,
+                arithm::addWeighted<signed char, float, int   >,
+                arithm::addWeighted<signed char, float, float >,
+                arithm::addWeighted<signed char, float, double>
             },
             {
-                addWeighted_gpu<signed char, double, unsigned char >,
-                addWeighted_gpu<signed char, double, signed char >,
-                addWeighted_gpu<signed char, double, unsigned short>,
-                addWeighted_gpu<signed char, double, short >,
-                addWeighted_gpu<signed char, double, int   >,
-                addWeighted_gpu<signed char, double, float >,
-                addWeighted_gpu<signed char, double, double>
+                arithm::addWeighted<signed char, double, unsigned char >,
+                arithm::addWeighted<signed char, double, signed char >,
+                arithm::addWeighted<signed char, double, unsigned short>,
+                arithm::addWeighted<signed char, double, short >,
+                arithm::addWeighted<signed char, double, int   >,
+                arithm::addWeighted<signed char, double, float >,
+                arithm::addWeighted<signed char, double, double>
             }
         },
         {
             {
-                0/*addWeighted_gpu<unsigned short, unsigned char, unsigned char >*/,
-                0/*addWeighted_gpu<unsigned short, unsigned char, signed char >*/,
-                0/*addWeighted_gpu<unsigned short, unsigned char, unsigned short>*/,
-                0/*addWeighted_gpu<unsigned short, unsigned char, short >*/,
-                0/*addWeighted_gpu<unsigned short, unsigned char, int   >*/,
-                0/*addWeighted_gpu<unsigned short, unsigned char, float >*/,
-                0/*addWeighted_gpu<unsigned short, unsigned char, double>*/
+                0/*arithm::addWeighted<unsigned short, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, short >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, int   >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, float >*/,
+                0/*arithm::addWeighted<unsigned short, unsigned char, double>*/
             },
             {
-                0/*addWeighted_gpu<unsigned short, signed char, unsigned char >*/,
-                0/*addWeighted_gpu<unsigned short, signed char, signed char >*/,
-                0/*addWeighted_gpu<unsigned short, signed char, unsigned short>*/,
-                0/*addWeighted_gpu<unsigned short, signed char, short >*/,
-                0/*addWeighted_gpu<unsigned short, signed char, int   >*/,
-                0/*addWeighted_gpu<unsigned short, signed char, float >*/,
-                0/*addWeighted_gpu<unsigned short, signed char, double>*/
+                0/*arithm::addWeighted<unsigned short, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, signed char >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<unsigned short, signed char, short >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, int   >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, float >*/,
+                0/*arithm::addWeighted<unsigned short, signed char, double>*/
             },
             {
-                addWeighted_gpu<unsigned short, unsigned short, unsigned char >,
-                addWeighted_gpu<unsigned short, unsigned short, signed char >,
-                addWeighted_gpu<unsigned short, unsigned short, unsigned short>,
-                addWeighted_gpu<unsigned short, unsigned short, short >,
-                addWeighted_gpu<unsigned short, unsigned short, int   >,
-                addWeighted_gpu<unsigned short, unsigned short, float >,
-                addWeighted_gpu<unsigned short, unsigned short, double>
+                arithm::addWeighted<unsigned short, unsigned short, unsigned char >,
+                arithm::addWeighted<unsigned short, unsigned short, signed char >,
+                arithm::addWeighted<unsigned short, unsigned short, unsigned short>,
+                arithm::addWeighted<unsigned short, unsigned short, short >,
+                arithm::addWeighted<unsigned short, unsigned short, int   >,
+                arithm::addWeighted<unsigned short, unsigned short, float >,
+                arithm::addWeighted<unsigned short, unsigned short, double>
             },
             {
-                addWeighted_gpu<unsigned short, short, unsigned char >,
-                addWeighted_gpu<unsigned short, short, signed char >,
-                addWeighted_gpu<unsigned short, short, unsigned short>,
-                addWeighted_gpu<unsigned short, short, short >,
-                addWeighted_gpu<unsigned short, short, int   >,
-                addWeighted_gpu<unsigned short, short, float >,
-                addWeighted_gpu<unsigned short, short, double>
+                arithm::addWeighted<unsigned short, short, unsigned char >,
+                arithm::addWeighted<unsigned short, short, signed char >,
+                arithm::addWeighted<unsigned short, short, unsigned short>,
+                arithm::addWeighted<unsigned short, short, short >,
+                arithm::addWeighted<unsigned short, short, int   >,
+                arithm::addWeighted<unsigned short, short, float >,
+                arithm::addWeighted<unsigned short, short, double>
             },
             {
-                addWeighted_gpu<unsigned short, int, unsigned char >,
-                addWeighted_gpu<unsigned short, int, signed char >,
-                addWeighted_gpu<unsigned short, int, unsigned short>,
-                addWeighted_gpu<unsigned short, int, short >,
-                addWeighted_gpu<unsigned short, int, int   >,
-                addWeighted_gpu<unsigned short, int, float >,
-                addWeighted_gpu<unsigned short, int, double>
+                arithm::addWeighted<unsigned short, int, unsigned char >,
+                arithm::addWeighted<unsigned short, int, signed char >,
+                arithm::addWeighted<unsigned short, int, unsigned short>,
+                arithm::addWeighted<unsigned short, int, short >,
+                arithm::addWeighted<unsigned short, int, int   >,
+                arithm::addWeighted<unsigned short, int, float >,
+                arithm::addWeighted<unsigned short, int, double>
             },
             {
-                addWeighted_gpu<unsigned short, float, unsigned char >,
-                addWeighted_gpu<unsigned short, float, signed char >,
-                addWeighted_gpu<unsigned short, float, unsigned short>,
-                addWeighted_gpu<unsigned short, float, short >,
-                addWeighted_gpu<unsigned short, float, int   >,
-                addWeighted_gpu<unsigned short, float, float >,
-                addWeighted_gpu<unsigned short, float, double>
+                arithm::addWeighted<unsigned short, float, unsigned char >,
+                arithm::addWeighted<unsigned short, float, signed char >,
+                arithm::addWeighted<unsigned short, float, unsigned short>,
+                arithm::addWeighted<unsigned short, float, short >,
+                arithm::addWeighted<unsigned short, float, int   >,
+                arithm::addWeighted<unsigned short, float, float >,
+                arithm::addWeighted<unsigned short, float, double>
             },
             {
-                addWeighted_gpu<unsigned short, double, unsigned char >,
-                addWeighted_gpu<unsigned short, double, signed char >,
-                addWeighted_gpu<unsigned short, double, unsigned short>,
-                addWeighted_gpu<unsigned short, double, short >,
-                addWeighted_gpu<unsigned short, double, int   >,
-                addWeighted_gpu<unsigned short, double, float >,
-                addWeighted_gpu<unsigned short, double, double>
+                arithm::addWeighted<unsigned short, double, unsigned char >,
+                arithm::addWeighted<unsigned short, double, signed char >,
+                arithm::addWeighted<unsigned short, double, unsigned short>,
+                arithm::addWeighted<unsigned short, double, short >,
+                arithm::addWeighted<unsigned short, double, int   >,
+                arithm::addWeighted<unsigned short, double, float >,
+                arithm::addWeighted<unsigned short, double, double>
             }
         },
         {
             {
-                0/*addWeighted_gpu<short, unsigned char, unsigned char >*/,
-                0/*addWeighted_gpu<short, unsigned char, signed char >*/,
-                0/*addWeighted_gpu<short, unsigned char, unsigned short>*/,
-                0/*addWeighted_gpu<short, unsigned char, short >*/,
-                0/*addWeighted_gpu<short, unsigned char, int   >*/,
-                0/*addWeighted_gpu<short, unsigned char, float >*/,
-                0/*addWeighted_gpu<short, unsigned char, double>*/
+                0/*arithm::addWeighted<short, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<short, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<short, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<short, unsigned char, short >*/,
+                0/*arithm::addWeighted<short, unsigned char, int   >*/,
+                0/*arithm::addWeighted<short, unsigned char, float >*/,
+                0/*arithm::addWeighted<short, unsigned char, double>*/
             },
             {
-                0/*addWeighted_gpu<short, signed char, unsigned char >*/,
-                0/*addWeighted_gpu<short, signed char, signed char >*/,
-                0/*addWeighted_gpu<short, signed char, unsigned short>*/,
-                0/*addWeighted_gpu<short, signed char, short >*/,
-                0/*addWeighted_gpu<short, signed char, int   >*/,
-                0/*addWeighted_gpu<short, signed char, float >*/,
-                0/*addWeighted_gpu<short, signed char, double>*/
+                0/*arithm::addWeighted<short, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<short, signed char, signed char >*/,
+                0/*arithm::addWeighted<short, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<short, signed char, short >*/,
+                0/*arithm::addWeighted<short, signed char, int   >*/,
+                0/*arithm::addWeighted<short, signed char, float >*/,
+                0/*arithm::addWeighted<short, signed char, double>*/
             },
             {
-                0/*addWeighted_gpu<short, unsigned short, unsigned char >*/,
-                0/*addWeighted_gpu<short, unsigned short, signed char >*/,
-                0/*addWeighted_gpu<short, unsigned short, unsigned short>*/,
-                0/*addWeighted_gpu<short, unsigned short, short >*/,
-                0/*addWeighted_gpu<short, unsigned short, int   >*/,
-                0/*addWeighted_gpu<short, unsigned short, float >*/,
-                0/*addWeighted_gpu<short, unsigned short, double>*/
+                0/*arithm::addWeighted<short, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<short, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<short, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<short, unsigned short, short >*/,
+                0/*arithm::addWeighted<short, unsigned short, int   >*/,
+                0/*arithm::addWeighted<short, unsigned short, float >*/,
+                0/*arithm::addWeighted<short, unsigned short, double>*/
             },
             {
-                addWeighted_gpu<short, short, unsigned char >,
-                addWeighted_gpu<short, short, signed char >,
-                addWeighted_gpu<short, short, unsigned short>,
-                addWeighted_gpu<short, short, short >,
-                addWeighted_gpu<short, short, int   >,
-                addWeighted_gpu<short, short, float >,
-                addWeighted_gpu<short, short, double>
+                arithm::addWeighted<short, short, unsigned char >,
+                arithm::addWeighted<short, short, signed char >,
+                arithm::addWeighted<short, short, unsigned short>,
+                arithm::addWeighted<short, short, short >,
+                arithm::addWeighted<short, short, int   >,
+                arithm::addWeighted<short, short, float >,
+                arithm::addWeighted<short, short, double>
             },
             {
-                addWeighted_gpu<short, int, unsigned char >,
-                addWeighted_gpu<short, int, signed char >,
-                addWeighted_gpu<short, int, unsigned short>,
-                addWeighted_gpu<short, int, short >,
-                addWeighted_gpu<short, int, int   >,
-                addWeighted_gpu<short, int, float >,
-                addWeighted_gpu<short, int, double>
+                arithm::addWeighted<short, int, unsigned char >,
+                arithm::addWeighted<short, int, signed char >,
+                arithm::addWeighted<short, int, unsigned short>,
+                arithm::addWeighted<short, int, short >,
+                arithm::addWeighted<short, int, int   >,
+                arithm::addWeighted<short, int, float >,
+                arithm::addWeighted<short, int, double>
             },
             {
-                addWeighted_gpu<short, float, unsigned char >,
-                addWeighted_gpu<short, float, signed char >,
-                addWeighted_gpu<short, float, unsigned short>,
-                addWeighted_gpu<short, float, short >,
-                addWeighted_gpu<short, float, int   >,
-                addWeighted_gpu<short, float, float >,
-                addWeighted_gpu<short, float, double>
+                arithm::addWeighted<short, float, unsigned char >,
+                arithm::addWeighted<short, float, signed char >,
+                arithm::addWeighted<short, float, unsigned short>,
+                arithm::addWeighted<short, float, short >,
+                arithm::addWeighted<short, float, int   >,
+                arithm::addWeighted<short, float, float >,
+                arithm::addWeighted<short, float, double>
             },
             {
-                addWeighted_gpu<short, double, unsigned char >,
-                addWeighted_gpu<short, double, signed char >,
-                addWeighted_gpu<short, double, unsigned short>,
-                addWeighted_gpu<short, double, short >,
-                addWeighted_gpu<short, double, int   >,
-                addWeighted_gpu<short, double, float >,
-                addWeighted_gpu<short, double, double>
+                arithm::addWeighted<short, double, unsigned char >,
+                arithm::addWeighted<short, double, signed char >,
+                arithm::addWeighted<short, double, unsigned short>,
+                arithm::addWeighted<short, double, short >,
+                arithm::addWeighted<short, double, int   >,
+                arithm::addWeighted<short, double, float >,
+                arithm::addWeighted<short, double, double>
             }
         },
         {
             {
-                0/*addWeighted_gpu<int, unsigned char, unsigned char >*/,
-                0/*addWeighted_gpu<int, unsigned char, signed char >*/,
-                0/*addWeighted_gpu<int, unsigned char, unsigned short>*/,
-                0/*addWeighted_gpu<int, unsigned char, short >*/,
-                0/*addWeighted_gpu<int, unsigned char, int   >*/,
-                0/*addWeighted_gpu<int, unsigned char, float >*/,
-                0/*addWeighted_gpu<int, unsigned char, double>*/
+                0/*arithm::addWeighted<int, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<int, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<int, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<int, unsigned char, short >*/,
+                0/*arithm::addWeighted<int, unsigned char, int   >*/,
+                0/*arithm::addWeighted<int, unsigned char, float >*/,
+                0/*arithm::addWeighted<int, unsigned char, double>*/
             },
             {
-                0/*addWeighted_gpu<int, signed char, unsigned char >*/,
-                0/*addWeighted_gpu<int, signed char, signed char >*/,
-                0/*addWeighted_gpu<int, signed char, unsigned short>*/,
-                0/*addWeighted_gpu<int, signed char, short >*/,
-                0/*addWeighted_gpu<int, signed char, int   >*/,
-                0/*addWeighted_gpu<int, signed char, float >*/,
-                0/*addWeighted_gpu<int, signed char, double>*/
+                0/*arithm::addWeighted<int, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<int, signed char, signed char >*/,
+                0/*arithm::addWeighted<int, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<int, signed char, short >*/,
+                0/*arithm::addWeighted<int, signed char, int   >*/,
+                0/*arithm::addWeighted<int, signed char, float >*/,
+                0/*arithm::addWeighted<int, signed char, double>*/
             },
             {
-                0/*addWeighted_gpu<int, unsigned short, unsigned char >*/,
-                0/*addWeighted_gpu<int, unsigned short, signed char >*/,
-                0/*addWeighted_gpu<int, unsigned short, unsigned short>*/,
-                0/*addWeighted_gpu<int, unsigned short, short >*/,
-                0/*addWeighted_gpu<int, unsigned short, int   >*/,
-                0/*addWeighted_gpu<int, unsigned short, float >*/,
-                0/*addWeighted_gpu<int, unsigned short, double>*/
+                0/*arithm::addWeighted<int, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<int, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<int, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<int, unsigned short, short >*/,
+                0/*arithm::addWeighted<int, unsigned short, int   >*/,
+                0/*arithm::addWeighted<int, unsigned short, float >*/,
+                0/*arithm::addWeighted<int, unsigned short, double>*/
             },
             {
-                0/*addWeighted_gpu<int, short, unsigned char >*/,
-                0/*addWeighted_gpu<int, short, signed char >*/,
-                0/*addWeighted_gpu<int, short, unsigned short>*/,
-                0/*addWeighted_gpu<int, short, short >*/,
-                0/*addWeighted_gpu<int, short, int   >*/,
-                0/*addWeighted_gpu<int, short, float >*/,
-                0/*addWeighted_gpu<int, short, double>*/
+                0/*arithm::addWeighted<int, short, unsigned char >*/,
+                0/*arithm::addWeighted<int, short, signed char >*/,
+                0/*arithm::addWeighted<int, short, unsigned short>*/,
+                0/*arithm::addWeighted<int, short, short >*/,
+                0/*arithm::addWeighted<int, short, int   >*/,
+                0/*arithm::addWeighted<int, short, float >*/,
+                0/*arithm::addWeighted<int, short, double>*/
             },
             {
-                addWeighted_gpu<int, int, unsigned char >,
-                addWeighted_gpu<int, int, signed char >,
-                addWeighted_gpu<int, int, unsigned short>,
-                addWeighted_gpu<int, int, short >,
-                addWeighted_gpu<int, int, int   >,
-                addWeighted_gpu<int, int, float >,
-                addWeighted_gpu<int, int, double>
+                arithm::addWeighted<int, int, unsigned char >,
+                arithm::addWeighted<int, int, signed char >,
+                arithm::addWeighted<int, int, unsigned short>,
+                arithm::addWeighted<int, int, short >,
+                arithm::addWeighted<int, int, int   >,
+                arithm::addWeighted<int, int, float >,
+                arithm::addWeighted<int, int, double>
             },
             {
-                addWeighted_gpu<int, float, unsigned char >,
-                addWeighted_gpu<int, float, signed char >,
-                addWeighted_gpu<int, float, unsigned short>,
-                addWeighted_gpu<int, float, short >,
-                addWeighted_gpu<int, float, int   >,
-                addWeighted_gpu<int, float, float >,
-                addWeighted_gpu<int, float, double>
+                arithm::addWeighted<int, float, unsigned char >,
+                arithm::addWeighted<int, float, signed char >,
+                arithm::addWeighted<int, float, unsigned short>,
+                arithm::addWeighted<int, float, short >,
+                arithm::addWeighted<int, float, int   >,
+                arithm::addWeighted<int, float, float >,
+                arithm::addWeighted<int, float, double>
             },
             {
-                addWeighted_gpu<int, double, unsigned char >,
-                addWeighted_gpu<int, double, signed char >,
-                addWeighted_gpu<int, double, unsigned short>,
-                addWeighted_gpu<int, double, short >,
-                addWeighted_gpu<int, double, int   >,
-                addWeighted_gpu<int, double, float >,
-                addWeighted_gpu<int, double, double>
+                arithm::addWeighted<int, double, unsigned char >,
+                arithm::addWeighted<int, double, signed char >,
+                arithm::addWeighted<int, double, unsigned short>,
+                arithm::addWeighted<int, double, short >,
+                arithm::addWeighted<int, double, int   >,
+                arithm::addWeighted<int, double, float >,
+                arithm::addWeighted<int, double, double>
             }
         },
         {
             {
-                0/*addWeighted_gpu<float, unsigned char, unsigned char >*/,
-                0/*addWeighted_gpu<float, unsigned char, signed char >*/,
-                0/*addWeighted_gpu<float, unsigned char, unsigned short>*/,
-                0/*addWeighted_gpu<float, unsigned char, short >*/,
-                0/*addWeighted_gpu<float, unsigned char, int   >*/,
-                0/*addWeighted_gpu<float, unsigned char, float >*/,
-                0/*addWeighted_gpu<float, unsigned char, double>*/
+                0/*arithm::addWeighted<float, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<float, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<float, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<float, unsigned char, short >*/,
+                0/*arithm::addWeighted<float, unsigned char, int   >*/,
+                0/*arithm::addWeighted<float, unsigned char, float >*/,
+                0/*arithm::addWeighted<float, unsigned char, double>*/
             },
             {
-                0/*addWeighted_gpu<float, signed char, unsigned char >*/,
-                0/*addWeighted_gpu<float, signed char, signed char >*/,
-                0/*addWeighted_gpu<float, signed char, unsigned short>*/,
-                0/*addWeighted_gpu<float, signed char, short >*/,
-                0/*addWeighted_gpu<float, signed char, int   >*/,
-                0/*addWeighted_gpu<float, signed char, float >*/,
-                0/*addWeighted_gpu<float, signed char, double>*/
+                0/*arithm::addWeighted<float, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<float, signed char, signed char >*/,
+                0/*arithm::addWeighted<float, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<float, signed char, short >*/,
+                0/*arithm::addWeighted<float, signed char, int   >*/,
+                0/*arithm::addWeighted<float, signed char, float >*/,
+                0/*arithm::addWeighted<float, signed char, double>*/
             },
             {
-                0/*addWeighted_gpu<float, unsigned short, unsigned char >*/,
-                0/*addWeighted_gpu<float, unsigned short, signed char >*/,
-                0/*addWeighted_gpu<float, unsigned short, unsigned short>*/,
-                0/*addWeighted_gpu<float, unsigned short, short >*/,
-                0/*addWeighted_gpu<float, unsigned short, int   >*/,
-                0/*addWeighted_gpu<float, unsigned short, float >*/,
-                0/*addWeighted_gpu<float, unsigned short, double>*/
+                0/*arithm::addWeighted<float, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<float, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<float, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<float, unsigned short, short >*/,
+                0/*arithm::addWeighted<float, unsigned short, int   >*/,
+                0/*arithm::addWeighted<float, unsigned short, float >*/,
+                0/*arithm::addWeighted<float, unsigned short, double>*/
             },
             {
-                0/*addWeighted_gpu<float, short, unsigned char >*/,
-                0/*addWeighted_gpu<float, short, signed char >*/,
-                0/*addWeighted_gpu<float, short, unsigned short>*/,
-                0/*addWeighted_gpu<float, short, short >*/,
-                0/*addWeighted_gpu<float, short, int   >*/,
-                0/*addWeighted_gpu<float, short, float >*/,
-                0/*addWeighted_gpu<float, short, double>*/
+                0/*arithm::addWeighted<float, short, unsigned char >*/,
+                0/*arithm::addWeighted<float, short, signed char >*/,
+                0/*arithm::addWeighted<float, short, unsigned short>*/,
+                0/*arithm::addWeighted<float, short, short >*/,
+                0/*arithm::addWeighted<float, short, int   >*/,
+                0/*arithm::addWeighted<float, short, float >*/,
+                0/*arithm::addWeighted<float, short, double>*/
             },
             {
-                0/*addWeighted_gpu<float, int, unsigned char >*/,
-                0/*addWeighted_gpu<float, int, signed char >*/,
-                0/*addWeighted_gpu<float, int, unsigned short>*/,
-                0/*addWeighted_gpu<float, int, short >*/,
-                0/*addWeighted_gpu<float, int, int   >*/,
-                0/*addWeighted_gpu<float, int, float >*/,
-                0/*addWeighted_gpu<float, int, double>*/
+                0/*arithm::addWeighted<float, int, unsigned char >*/,
+                0/*arithm::addWeighted<float, int, signed char >*/,
+                0/*arithm::addWeighted<float, int, unsigned short>*/,
+                0/*arithm::addWeighted<float, int, short >*/,
+                0/*arithm::addWeighted<float, int, int   >*/,
+                0/*arithm::addWeighted<float, int, float >*/,
+                0/*arithm::addWeighted<float, int, double>*/
             },
             {
-                addWeighted_gpu<float, float, unsigned char >,
-                addWeighted_gpu<float, float, signed char >,
-                addWeighted_gpu<float, float, unsigned short>,
-                addWeighted_gpu<float, float, short >,
-                addWeighted_gpu<float, float, int   >,
-                addWeighted_gpu<float, float, float >,
-                addWeighted_gpu<float, float, double>
+                arithm::addWeighted<float, float, unsigned char >,
+                arithm::addWeighted<float, float, signed char >,
+                arithm::addWeighted<float, float, unsigned short>,
+                arithm::addWeighted<float, float, short >,
+                arithm::addWeighted<float, float, int   >,
+                arithm::addWeighted<float, float, float >,
+                arithm::addWeighted<float, float, double>
             },
             {
-                addWeighted_gpu<float, double, unsigned char >,
-                addWeighted_gpu<float, double, signed char >,
-                addWeighted_gpu<float, double, unsigned short>,
-                addWeighted_gpu<float, double, short >,
-                addWeighted_gpu<float, double, int   >,
-                addWeighted_gpu<float, double, float >,
-                addWeighted_gpu<float, double, double>
+                arithm::addWeighted<float, double, unsigned char >,
+                arithm::addWeighted<float, double, signed char >,
+                arithm::addWeighted<float, double, unsigned short>,
+                arithm::addWeighted<float, double, short >,
+                arithm::addWeighted<float, double, int   >,
+                arithm::addWeighted<float, double, float >,
+                arithm::addWeighted<float, double, double>
             }
         },
         {
             {
-                0/*addWeighted_gpu<double, unsigned char, unsigned char >*/,
-                0/*addWeighted_gpu<double, unsigned char, signed char >*/,
-                0/*addWeighted_gpu<double, unsigned char, unsigned short>*/,
-                0/*addWeighted_gpu<double, unsigned char, short >*/,
-                0/*addWeighted_gpu<double, unsigned char, int   >*/,
-                0/*addWeighted_gpu<double, unsigned char, float >*/,
-                0/*addWeighted_gpu<double, unsigned char, double>*/
+                0/*arithm::addWeighted<double, unsigned char, unsigned char >*/,
+                0/*arithm::addWeighted<double, unsigned char, signed char >*/,
+                0/*arithm::addWeighted<double, unsigned char, unsigned short>*/,
+                0/*arithm::addWeighted<double, unsigned char, short >*/,
+                0/*arithm::addWeighted<double, unsigned char, int   >*/,
+                0/*arithm::addWeighted<double, unsigned char, float >*/,
+                0/*arithm::addWeighted<double, unsigned char, double>*/
             },
             {
-                0/*addWeighted_gpu<double, signed char, unsigned char >*/,
-                0/*addWeighted_gpu<double, signed char, signed char >*/,
-                0/*addWeighted_gpu<double, signed char, unsigned short>*/,
-                0/*addWeighted_gpu<double, signed char, short >*/,
-                0/*addWeighted_gpu<double, signed char, int   >*/,
-                0/*addWeighted_gpu<double, signed char, float >*/,
-                0/*addWeighted_gpu<double, signed char, double>*/
+                0/*arithm::addWeighted<double, signed char, unsigned char >*/,
+                0/*arithm::addWeighted<double, signed char, signed char >*/,
+                0/*arithm::addWeighted<double, signed char, unsigned short>*/,
+                0/*arithm::addWeighted<double, signed char, short >*/,
+                0/*arithm::addWeighted<double, signed char, int   >*/,
+                0/*arithm::addWeighted<double, signed char, float >*/,
+                0/*arithm::addWeighted<double, signed char, double>*/
             },
             {
-                0/*addWeighted_gpu<double, unsigned short, unsigned char >*/,
-                0/*addWeighted_gpu<double, unsigned short, signed char >*/,
-                0/*addWeighted_gpu<double, unsigned short, unsigned short>*/,
-                0/*addWeighted_gpu<double, unsigned short, short >*/,
-                0/*addWeighted_gpu<double, unsigned short, int   >*/,
-                0/*addWeighted_gpu<double, unsigned short, float >*/,
-                0/*addWeighted_gpu<double, unsigned short, double>*/
+                0/*arithm::addWeighted<double, unsigned short, unsigned char >*/,
+                0/*arithm::addWeighted<double, unsigned short, signed char >*/,
+                0/*arithm::addWeighted<double, unsigned short, unsigned short>*/,
+                0/*arithm::addWeighted<double, unsigned short, short >*/,
+                0/*arithm::addWeighted<double, unsigned short, int   >*/,
+                0/*arithm::addWeighted<double, unsigned short, float >*/,
+                0/*arithm::addWeighted<double, unsigned short, double>*/
             },
             {
-                0/*addWeighted_gpu<double, short, unsigned char >*/,
-                0/*addWeighted_gpu<double, short, signed char >*/,
-                0/*addWeighted_gpu<double, short, unsigned short>*/,
-                0/*addWeighted_gpu<double, short, short >*/,
-                0/*addWeighted_gpu<double, short, int   >*/,
-                0/*addWeighted_gpu<double, short, float >*/,
-                0/*addWeighted_gpu<double, short, double>*/
+                0/*arithm::addWeighted<double, short, unsigned char >*/,
+                0/*arithm::addWeighted<double, short, signed char >*/,
+                0/*arithm::addWeighted<double, short, unsigned short>*/,
+                0/*arithm::addWeighted<double, short, short >*/,
+                0/*arithm::addWeighted<double, short, int   >*/,
+                0/*arithm::addWeighted<double, short, float >*/,
+                0/*arithm::addWeighted<double, short, double>*/
             },
             {
-                0/*addWeighted_gpu<double, int, unsigned char >*/,
-                0/*addWeighted_gpu<double, int, signed char >*/,
-                0/*addWeighted_gpu<double, int, unsigned short>*/,
-                0/*addWeighted_gpu<double, int, short >*/,
-                0/*addWeighted_gpu<double, int, int   >*/,
-                0/*addWeighted_gpu<double, int, float >*/,
-                0/*addWeighted_gpu<double, int, double>*/
+                0/*arithm::addWeighted<double, int, unsigned char >*/,
+                0/*arithm::addWeighted<double, int, signed char >*/,
+                0/*arithm::addWeighted<double, int, unsigned short>*/,
+                0/*arithm::addWeighted<double, int, short >*/,
+                0/*arithm::addWeighted<double, int, int   >*/,
+                0/*arithm::addWeighted<double, int, float >*/,
+                0/*arithm::addWeighted<double, int, double>*/
             },
             {
-                0/*addWeighted_gpu<double, float, unsigned char >*/,
-                0/*addWeighted_gpu<double, float, signed char >*/,
-                0/*addWeighted_gpu<double, float, unsigned short>*/,
-                0/*addWeighted_gpu<double, float, short >*/,
-                0/*addWeighted_gpu<double, float, int   >*/,
-                0/*addWeighted_gpu<double, float, float >*/,
-                0/*addWeighted_gpu<double, float, double>*/
+                0/*arithm::addWeighted<double, float, unsigned char >*/,
+                0/*arithm::addWeighted<double, float, signed char >*/,
+                0/*arithm::addWeighted<double, float, unsigned short>*/,
+                0/*arithm::addWeighted<double, float, short >*/,
+                0/*arithm::addWeighted<double, float, int   >*/,
+                0/*arithm::addWeighted<double, float, float >*/,
+                0/*arithm::addWeighted<double, float, double>*/
             },
             {
-                addWeighted_gpu<double, double, unsigned char >,
-                addWeighted_gpu<double, double, signed char >,
-                addWeighted_gpu<double, double, unsigned short>,
-                addWeighted_gpu<double, double, short >,
-                addWeighted_gpu<double, double, int   >,
-                addWeighted_gpu<double, double, float >,
-                addWeighted_gpu<double, double, double>
+                arithm::addWeighted<double, double, unsigned char >,
+                arithm::addWeighted<double, double, signed char >,
+                arithm::addWeighted<double, double, unsigned short>,
+                arithm::addWeighted<double, double, short >,
+                arithm::addWeighted<double, double, int   >,
+                arithm::addWeighted<double, double, float >,
+                arithm::addWeighted<double, double, double>
             }
         }
     };
 
-    CV_Assert(src1.size() == src2.size());
-    CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));
+    int sdepth1 = src1.depth();
+    int sdepth2 = src2.depth();
+    ddepth = ddepth >= 0 ? CV_MAT_DEPTH(ddepth) : std::max(sdepth1, sdepth2);
+    const int cn = src1.channels();
 
-    dtype = dtype >= 0 ? CV_MAKETYPE(dtype, src1.channels()) : src1.type();
+    CV_Assert( src2.size() == src1.size() && src2.channels() == cn );
+    CV_Assert( sdepth1 <= CV_64F && sdepth2 <= CV_64F && ddepth <= CV_64F );
 
-    CV_Assert(src1.depth() <= CV_64F && src2.depth() <= CV_64F && CV_MAT_DEPTH(dtype) <= CV_64F);
-
-    if (src1.depth() == CV_64F || src2.depth() == CV_64F || CV_MAT_DEPTH(dtype) == CV_64F)
+    if (sdepth1 == CV_64F || sdepth2 == CV_64F || ddepth == CV_64F)
     {
-        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+        if (!deviceSupports(NATIVE_DOUBLE))
             CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
     }
 
-    dst.create(src1.size(), dtype);
+    dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
 
-    const GpuMat* psrc1 = &src1;
-    const GpuMat* psrc2 = &src2;
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
 
-    if (src1.depth() > src2.depth())
+    if (sdepth1 > sdepth2)
     {
-        std::swap(psrc1, psrc2);
+        std::swap(src1_.data, src2_.data);
+        std::swap(src1_.step, src2_.step);
         std::swap(alpha, beta);
+        std::swap(sdepth1, sdepth2);
     }
 
-    const func_t func = funcs[psrc1->depth()][psrc2->depth()][dst.depth()];
+    const func_t func = funcs[sdepth1][sdepth2][ddepth];
 
     if (!func)
         CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(psrc1->reshape(1), alpha, psrc2->reshape(1), beta, gamma, dst.reshape(1), StreamAccessor::getStream(stream));
+    func(src1_, alpha, src2_, beta, gamma, dst_, StreamAccessor::getStream(stream));
 }
 
 #endif
diff --git a/modules/gpu/test/test_core.cpp b/modules/gpu/test/test_core.cpp
index abdcb0fa7..695ec9758 100644
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@@ -210,7 +210,6 @@ TEST_P(Add_Array, Accuracy)
 {
     cv::Mat mat1 = randomMat(size, stype);
     cv::Mat mat2 = randomMat(size, stype);
-    cv::Mat mask = randomMat(size, CV_8UC1, 0.0, 2.0);
 
     if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
     {
@@ -228,10 +227,10 @@ TEST_P(Add_Array, Accuracy)
     {
         cv::gpu::GpuMat dst = createMat(size, dtype, useRoi);
         dst.setTo(cv::Scalar::all(0));
-        cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, channels == 1 ? loadMat(mask, useRoi) : cv::gpu::GpuMat(), depth.second);
+        cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, cv::gpu::GpuMat(), depth.second);
 
         cv::Mat dst_gold(size, dtype, cv::Scalar::all(0));
-        cv::add(mat1, mat2, dst_gold, channels == 1 ? mask : cv::noArray(), depth.second);
+        cv::add(mat1, mat2, dst_gold, cv::noArray(), depth.second);
 
         EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
     }
@@ -244,6 +243,67 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Add_Array, testing::Combine(
     ALL_CHANNELS,
     WHOLE_SUBMAT));
 
+PARAM_TEST_CASE(Add_Array_Mask, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi) // fixture for the masked cv::gpu::add accuracy test
+{
+    cv::gpu::DeviceInfo devInfo; // filled from GET_PARAM(0)
+    cv::Size size; // filled from GET_PARAM(1)
+    std::pair<MatDepth, MatDepth> depth; // filled from GET_PARAM(2): first feeds stype, second feeds dtype
+    bool useRoi; // filled from GET_PARAM(3)
+
+    int stype; // source matrix type, derived in SetUp
+    int dtype; // destination matrix type, derived in SetUp
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // switch to the device supplied as a test parameter
+
+        stype = CV_MAKE_TYPE(depth.first, 1); // channel count is fixed at 1 in this fixture
+        dtype = CV_MAKE_TYPE(depth.second, 1); // same single-channel layout for the destination
+    }
+};
+
+TEST_P(Add_Array_Mask, Accuracy) // checks cv::gpu::add with a mask against cv::add with the same mask
+{
+    cv::Mat mat1 = randomMat(size, stype);
+    cv::Mat mat2 = randomMat(size, stype);
+    cv::Mat mask = randomMat(size, CV_8UC1, 0, 2); // CV_8UC1 mask; randomMat receives the bounds 0 and 2
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a 64F depth is requested but the device lacks NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::add(loadMat(mat1), loadMat(mat2), dst, cv::gpu::GpuMat(), depth.second); // NOTE(review): this branch passes an empty GpuMat, not 'mask'
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsUnsupportedFormat, e.code); // NOTE(review): only checked when an exception is thrown; nothing fails if add() returns normally
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, dtype, useRoi);
+        dst.setTo(cv::Scalar::all(0)); // zero-fill dst; dst_gold below is constructed with the same all-zero value
+        cv::gpu::add(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, loadMat(mask, useRoi), depth.second); // GPU result under test
+
+        cv::Mat dst_gold(size, dtype, cv::Scalar::all(0));
+        cv::add(mat1, mat2, dst_gold, mask, depth.second); // CPU reference using the same mask and depth.second
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); // tolerance 1e-4 if either depth is >= CV_32F, otherwise 0.0
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Core, Add_Array_Mask, testing::Combine( // parameter sets below are listed in GET_PARAM index order
+    ALL_DEVICES, // GET_PARAM(0) -> devInfo
+    DIFFERENT_SIZES, // GET_PARAM(1) -> size
+    DEPTH_PAIRS, // GET_PARAM(2) -> depth
+    WHOLE_SUBMAT)); // GET_PARAM(3) -> useRoi
+
 ////////////////////////////////////////////////////////////////////////////////
 // Add_Scalar
 
@@ -362,6 +422,67 @@ PARAM_TEST_CASE(Subtract_Array, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDept
 };
 
 TEST_P(Subtract_Array, Accuracy)
+{ // checks unmasked cv::gpu::subtract against cv::subtract on the same random inputs
+    cv::Mat mat1 = randomMat(size, stype);
+    cv::Mat mat2 = randomMat(size, stype);
+
+    if ((depth.first == CV_64F || depth.second == CV_64F) && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE)) // a 64F depth is requested but the device lacks NATIVE_DOUBLE
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::subtract(loadMat(mat1), loadMat(mat2), dst, cv::gpu::GpuMat(), depth.second); // this branch only inspects the error code
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsUnsupportedFormat, e.code); // NOTE(review): nothing fails if the call does not throw at all
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, dtype, useRoi);
+        dst.setTo(cv::Scalar::all(0));
+        cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, cv::gpu::GpuMat(), depth.second); // GPU result under test; empty GpuMat passed as the mask
+
+        cv::Mat dst_gold(size, dtype, cv::Scalar::all(0));
+        cv::subtract(mat1, mat2, dst_gold, cv::noArray(), depth.second); // CPU reference result, also without a mask
+
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0); // tolerance 1e-4 when either depth is >= CV_32F, otherwise 0.0
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Array, testing::Combine( // instantiates the fixture over every combination of the generators below
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    DEPTH_PAIRS,
+    ALL_CHANNELS,
+    WHOLE_SUBMAT)); // five generators, including ALL_CHANNELS
+
+PARAM_TEST_CASE(Subtract_Array_Mask, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi) // fixture for masked cv::gpu::subtract: device, matrix size, depth pair, ROI flag
+{
+    cv::gpu::DeviceInfo devInfo; // GPU the test runs on (parameter 0)
+    cv::Size size; // matrix size (parameter 1)
+    std::pair<MatDepth, MatDepth> depth; // first feeds stype, second feeds dtype (see SetUp)
+    bool useRoi; // ROI flag (parameter 3)
+
+    int stype; // source matrix type, derived from depth.first
+    int dtype; // destination matrix type, derived from depth.second
+
+    virtual void SetUp() // copies the test parameters into members and selects the device
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        depth = GET_PARAM(2);
+        useRoi = GET_PARAM(3);
+
+        cv::gpu::setDevice(devInfo.deviceID()); // select the GPU under test before any GpuMat work
+
+        stype = CV_MAKE_TYPE(depth.first, 1); // channel count is fixed at 1 for this fixture
+        dtype = CV_MAKE_TYPE(depth.second, 1); // channel count is fixed at 1 for this fixture
+    }
+};
+
+TEST_P(Subtract_Array_Mask, Accuracy)
 {
     cv::Mat mat1 = randomMat(size, stype);
     cv::Mat mat2 = randomMat(size, stype);
@@ -383,20 +504,19 @@ TEST_P(Subtract_Array, Accuracy)
     {
         cv::gpu::GpuMat dst = createMat(size, dtype, useRoi);
         dst.setTo(cv::Scalar::all(0));
-        cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, channels == 1 ? loadMat(mask, useRoi) : cv::gpu::GpuMat(), depth.second);
+        cv::gpu::subtract(loadMat(mat1, useRoi), loadMat(mat2, useRoi), dst, loadMat(mask, useRoi), depth.second);
 
         cv::Mat dst_gold(size, dtype, cv::Scalar::all(0));
-        cv::subtract(mat1, mat2, dst_gold, channels == 1 ? mask : cv::noArray(), depth.second);
+        cv::subtract(mat1, mat2, dst_gold, mask, depth.second);
 
         EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
     }
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Array, testing::Combine(
+INSTANTIATE_TEST_CASE_P(GPU_Core, Subtract_Array_Mask, testing::Combine( // masked variant: instantiated without a channel generator
     ALL_DEVICES,
     DIFFERENT_SIZES,
     DEPTH_PAIRS,
-    ALL_CHANNELS,
     WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -541,7 +661,7 @@ TEST_P(Multiply_Array, WithOutScale)
         cv::Mat dst_gold;
         cv::multiply(mat1, mat2, dst_gold, 1, depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-2 : 0.0);
     }
 }
 
@@ -571,7 +691,7 @@ TEST_P(Multiply_Array, WithScale)
         cv::Mat dst_gold;
         cv::multiply(mat1, mat2, dst_gold, scale, depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, 2.0);
     }
 }
 
@@ -726,7 +846,7 @@ TEST_P(Multiply_Scalar, WithOutScale)
         cv::Mat dst_gold;
         cv::multiply(mat, val, dst_gold, 1, depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-2 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
     }
 }
 
@@ -757,7 +877,7 @@ TEST_P(Multiply_Scalar, WithScale)
         cv::Mat dst_gold;
         cv::multiply(mat, val, dst_gold, scale, depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
     }
 }
 
@@ -1037,7 +1157,7 @@ TEST_P(Divide_Scalar, WithScale)
         cv::Mat dst_gold;
         cv::divide(mat, val, dst_gold, scale, depth.second);
 
-        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-4 : 0.0);
+        EXPECT_MAT_NEAR(dst_gold, dst, depth.first >= CV_32F || depth.second >= CV_32F ? 1e-2 : 0.0);
     }
 }