diff --git a/modules/gpuarithm/include/opencv2/gpuarithm.hpp b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
index 2b367fa1b..2fc6b48de 100644
--- a/modules/gpuarithm/include/opencv2/gpuarithm.hpp
+++ b/modules/gpuarithm/include/opencv2/gpuarithm.hpp
@@ -60,12 +60,14 @@ CV_EXPORTS void subtract(InputArray src1, InputArray src2, OutputArray dst, Inpu
 //! computes element-wise weighted product of the two arrays (dst = scale * src1 * src2)
 CV_EXPORTS void multiply(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
 
-//! computes element-wise weighted quotient of the two arrays (c = a / b)
-CV_EXPORTS void divide(const GpuMat& a, const GpuMat& b, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
-//! computes element-wise weighted quotient of matrix and scalar (c = a / s)
-CV_EXPORTS void divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+//! computes element-wise weighted quotient of the two arrays (dst = scale * (src1 / src2))
+CV_EXPORTS void divide(InputArray src1, InputArray src2, OutputArray dst, double scale = 1, int dtype = -1, Stream& stream = Stream::Null());
+
 //! computes element-wise weighted reciprocal of an array (dst = scale/src2)
-CV_EXPORTS void divide(double scale, const GpuMat& b, GpuMat& c, int dtype = -1, Stream& stream = Stream::Null());
+static inline void divide(double src1, InputArray src2, OutputArray dst, int dtype = -1, Stream& stream = Stream::Null())
+{
+    divide(src1, src2, dst, 1.0, dtype, stream); // delegates to the six-argument array overload with scale fixed at 1.0, so src1 alone acts as the numerator
+}
 
 //! computes the weighted sum of two arrays (dst = alpha*src1 + beta*src2 + gamma)
 CV_EXPORTS void addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst,
diff --git a/modules/gpuarithm/src/cuda/div_inv.cu b/modules/gpuarithm/src/cuda/div_inv.cu
deleted file mode 100644
index 9cfda933c..000000000
--- a/modules/gpuarithm/src/cuda/div_inv.cu
+++ /dev/null
@@ -1,144 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/simd_functions.hpp"
-
-#include "arithm_func_traits.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace arithm
-{
-    template <typename T, typename S, typename D> struct DivInv : unary_function<T, D>
-    {
-        S val;
-
-        __host__ explicit DivInv(S val_) : val(val_) {}
-
-        __device__ __forceinline__ D operator ()(T a) const
-        {
-            return a != 0 ? saturate_cast<D>(val / a) : 0;
-        }
-    };
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
-    {
-    };
-}}}
-
-namespace arithm
-{
-    template <typename T, typename S, typename D>
-    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
-    {
-        DivInv<T, S, D> op(static_cast<S>(val));
-        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
-    }
-
-    template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    template void divInv<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-
-    //template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}
-
-#endif // CUDA_DISABLER
diff --git a/modules/gpuarithm/src/cuda/div_scalar.cu b/modules/gpuarithm/src/cuda/div_scalar.cu
index 42ba90cb0..464c4adf8 100644
--- a/modules/gpuarithm/src/cuda/div_scalar.cu
+++ b/modules/gpuarithm/src/cuda/div_scalar.cu
@@ -66,6 +66,18 @@ namespace arithm
             return saturate_cast<D>(a / val);
         }
     };
+
+    template <typename T, typename S, typename D> struct DivScalarInv : unary_function<T, D> // unary functor: scalar divided by element (val / a)
+    {
+        S val; // scalar numerator, held as type S
+
+        explicit DivScalarInv(S val_) : val(val_) {}
+
+        __device__ __forceinline__ D operator ()(T a) const
+        {
+            return a != 0 ? saturate_cast<D>(val / a) : 0; // a zero element produces 0 instead of a division by zero
+        }
+    };
 }
 
 namespace cv { namespace gpu { namespace cudev
@@ -73,72 +85,84 @@ namespace cv { namespace gpu { namespace cudev
     template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalar<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)>
     {
     };
+
+    template <typename T, typename S, typename D> struct TransformFunctorTraits< arithm::DivScalarInv<T, S, D> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(D)> // same traits base as the DivScalar specialization, selected by sizeof(T) and sizeof(D)
+    {
+    };
 }}}
 
 namespace arithm
 {
     template <typename T, typename S, typename D>
-    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream)
+    void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream) // inv == true selects val / src1, inv == false selects src1 / val
     {
-        DivScalar<T, S, D> op(static_cast<S>(val));
-        cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        if (inv) // inverse path: each element a becomes val / a
+        {
+            DivScalarInv<T, S, D> op(static_cast<S>(val)); // val is converted from double to S before the functor stores it
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        }
+        else // direct path: each element a becomes a / val
+        {
+            DivScalar<T, S, D> op(static_cast<S>(val));
+            cudev::transform((PtrStepSz<T>) src1, (PtrStepSz<D>) dst, op, WithOutMask(), stream);
+        }
     }
 
-    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<uchar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<schar, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<schar, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<ushort, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<ushort, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<ushort, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<short, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<short, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<short, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<int, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<int, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<int, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<float, float, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, float, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<float, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 
-    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, schar>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, ushort>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, short>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, int>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    //template void divScalar<double, double, float>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
+    template void divScalar<double, double, double>(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 }
 
 #endif // CUDA_DISABLER
diff --git a/modules/gpuarithm/src/element_operations.cpp b/modules/gpuarithm/src/element_operations.cpp
index 76bfb9ba3..24f67b7db 100644
--- a/modules/gpuarithm/src/element_operations.cpp
+++ b/modules/gpuarithm/src/element_operations.cpp
@@ -53,9 +53,7 @@ void cv::gpu::subtract(InputArray, InputArray, OutputArray, InputArray, int, Str
 
 void cv::gpu::multiply(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); }
 
-void cv::gpu::divide(const GpuMat&, const GpuMat&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::divide(const GpuMat&, const Scalar&, GpuMat&, double, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::divide(double, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
+void cv::gpu::divide(InputArray, InputArray, OutputArray, double, int, Stream&) { throw_no_cuda(); } // single stub for the unified divide signature; unconditionally calls throw_no_cuda()
 
 void cv::gpu::absdiff(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
 void cv::gpu::absdiff(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_no_cuda(); }
@@ -120,7 +118,7 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
 namespace
 {
     typedef void (*mat_mat_func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream);
-    typedef void (*mat_scalar_func_t)(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, Stream& stream);
+    typedef void (*mat_scalar_func_t)(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double scale, Stream& stream);
 
     void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst, InputArray _mask, double scale, int dtype, Stream& stream,
                    mat_mat_func_t mat_mat_func, mat_scalar_func_t mat_scalar_func)
@@ -151,10 +149,6 @@ namespace
         {
             CV_Assert( scalar.total() <= 4 );
             scalar.convertTo(Mat_<double>(scalar.rows, scalar.cols, &val[0]), CV_64F);
-            val[0] *= scale;
-            val[1] *= scale;
-            val[2] *= scale;
-            val[3] *= scale;
         }
 
         GpuMat mask = _mask.getGpuMat();
@@ -182,9 +176,9 @@ namespace
         GpuMat dst = _dst.getGpuMat();
 
         if (isScalar1)
-            mat_scalar_func(src2, val, true, dst, mask, stream);
+            mat_scalar_func(src2, val, true, dst, mask, scale, stream);
         else if (isScalar2)
-            mat_scalar_func(src1, val, false, dst, mask, stream);
+            mat_scalar_func(src1, val, false, dst, mask, scale, stream);
         else
             mat_mat_func(src1, src2, dst, mask, scale, stream);
     }
@@ -505,7 +499,7 @@ namespace arithm
     void addScalar(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, Stream& _stream)
+static void addScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat& mask, double, Stream& _stream)
 {
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
@@ -756,7 +750,7 @@ namespace arithm
     void subScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
 }
 
-static void subScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, Stream& _stream)
+static void subScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat& mask, double, Stream& _stream)
 {
     typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
     static const func_t funcs[7][7] =
@@ -973,7 +967,7 @@ namespace arithm
     void mulScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
 }
 
-static void mulScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, Stream& _stream)
+static void mulScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const GpuMat&, double scale, Stream& _stream)
 {
     typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][7] =
@@ -1061,6 +1055,11 @@ static void mulScalar(const GpuMat& src, Scalar val, bool, GpuMat& dst, const Gp
 
     cudaStream_t stream = StreamAccessor::getStream(_stream);
 
+    val[0] *= scale;
+    val[1] *= scale;
+    val[2] *= scale;
+    val[3] *= scale;
+
     const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
     if (ddepth == sdepth && cn > 1 && npp_func != 0)
     {
@@ -1123,204 +1122,167 @@ namespace arithm
     void divMat(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
 }
 
-void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
+static void divMat(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat&, double scale, Stream& _stream)
 {
-    using namespace arithm;
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    if (src1.type() == CV_8UC4 && src2.type() == CV_32FC1)
+    typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
+    static const func_t funcs[7][7] =
     {
-        CV_Assert( src1.size() == src2.size() );
-
-        dst.create(src1.size(), src1.type());
-
-        divMat_8uc4_32f(src1, src2, dst, stream);
-    }
-    else if (src1.type() == CV_16SC4 && src2.type() == CV_32FC1)
-    {
-        CV_Assert( src1.size() == src2.size() );
-
-        dst.create(src1.size(), src1.type());
-
-        divMat_16sc4_32f(src1, src2, dst, stream);
-    }
-    else
-    {
-        typedef void (*func_t)(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
-        static const func_t funcs[7][7] =
         {
-            {
-                divMat<unsigned char, float, unsigned char>,
-                divMat<unsigned char, float, signed char>,
-                divMat<unsigned char, float, unsigned short>,
-                divMat<unsigned char, float, short>,
-                divMat<unsigned char, float, int>,
-                divMat<unsigned char, float, float>,
-                divMat<unsigned char, double, double>
-            },
-            {
-                divMat<signed char, float, unsigned char>,
-                divMat<signed char, float, signed char>,
-                divMat<signed char, float, unsigned short>,
-                divMat<signed char, float, short>,
-                divMat<signed char, float, int>,
-                divMat<signed char, float, float>,
-                divMat<signed char, double, double>
-            },
-            {
-                0 /*divMat<unsigned short, float, unsigned char>*/,
-                0 /*divMat<unsigned short, float, signed char>*/,
-                divMat<unsigned short, float, unsigned short>,
-                divMat<unsigned short, float, short>,
-                divMat<unsigned short, float, int>,
-                divMat<unsigned short, float, float>,
-                divMat<unsigned short, double, double>
-            },
-            {
-                0 /*divMat<short, float, unsigned char>*/,
-                0 /*divMat<short, float, signed char>*/,
-                divMat<short, float, unsigned short>,
-                divMat<short, float, short>,
-                divMat<short, float, int>,
-                divMat<short, float, float>,
-                divMat<short, double, double>
-            },
-            {
-                0 /*divMat<int, float, unsigned char>*/,
-                0 /*divMat<int, float, signed char>*/,
-                0 /*divMat<int, float, unsigned short>*/,
-                0 /*divMat<int, float, short>*/,
-                divMat<int, float, int>,
-                divMat<int, float, float>,
-                divMat<int, double, double>
-            },
-            {
-                0 /*divMat<float, float, unsigned char>*/,
-                0 /*divMat<float, float, signed char>*/,
-                0 /*divMat<float, float, unsigned short>*/,
-                0 /*divMat<float, float, short>*/,
-                0 /*divMat<float, float, int>*/,
-                divMat<float, float, float>,
-                divMat<float, double, double>
-            },
-            {
-                0 /*divMat<double, double, unsigned char>*/,
-                0 /*divMat<double, double, signed char>*/,
-                0 /*divMat<double, double, unsigned short>*/,
-                0 /*divMat<double, double, short>*/,
-                0 /*divMat<double, double, int>*/,
-                0 /*divMat<double, double, float>*/,
-                divMat<double, double, double>
-            }
-        };
-
-        if (dtype < 0)
-            dtype = src1.depth();
-
-        const int sdepth = src1.depth();
-        const int ddepth = CV_MAT_DEPTH(dtype);
-        const int cn = src1.channels();
-
-        CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-        CV_Assert( src2.type() == src1.type() && src2.size() == src1.size() );
-
-        if (sdepth == CV_64F || ddepth == CV_64F)
+            arithm::divMat<unsigned char, float, unsigned char>,
+            arithm::divMat<unsigned char, float, signed char>,
+            arithm::divMat<unsigned char, float, unsigned short>,
+            arithm::divMat<unsigned char, float, short>,
+            arithm::divMat<unsigned char, float, int>,
+            arithm::divMat<unsigned char, float, float>,
+            arithm::divMat<unsigned char, double, double>
+        },
         {
-            if (!deviceSupports(NATIVE_DOUBLE))
-                CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+            arithm::divMat<signed char, float, unsigned char>,
+            arithm::divMat<signed char, float, signed char>,
+            arithm::divMat<signed char, float, unsigned short>,
+            arithm::divMat<signed char, float, short>,
+            arithm::divMat<signed char, float, int>,
+            arithm::divMat<signed char, float, float>,
+            arithm::divMat<signed char, double, double>
+        },
+        {
+            0 /*arithm::divMat<unsigned short, float, unsigned char>*/,
+            0 /*arithm::divMat<unsigned short, float, signed char>*/,
+            arithm::divMat<unsigned short, float, unsigned short>,
+            arithm::divMat<unsigned short, float, short>,
+            arithm::divMat<unsigned short, float, int>,
+            arithm::divMat<unsigned short, float, float>,
+            arithm::divMat<unsigned short, double, double>
+        },
+        {
+            0 /*arithm::divMat<short, float, unsigned char>*/,
+            0 /*arithm::divMat<short, float, signed char>*/,
+            arithm::divMat<short, float, unsigned short>,
+            arithm::divMat<short, float, short>,
+            arithm::divMat<short, float, int>,
+            arithm::divMat<short, float, float>,
+            arithm::divMat<short, double, double>
+        },
+        {
+            0 /*arithm::divMat<int, float, unsigned char>*/,
+            0 /*arithm::divMat<int, float, signed char>*/,
+            0 /*arithm::divMat<int, float, unsigned short>*/,
+            0 /*arithm::divMat<int, float, short>*/,
+            arithm::divMat<int, float, int>,
+            arithm::divMat<int, float, float>,
+            arithm::divMat<int, double, double>
+        },
+        {
+            0 /*arithm::divMat<float, float, unsigned char>*/,
+            0 /*arithm::divMat<float, float, signed char>*/,
+            0 /*arithm::divMat<float, float, unsigned short>*/,
+            0 /*arithm::divMat<float, float, short>*/,
+            0 /*arithm::divMat<float, float, int>*/,
+            arithm::divMat<float, float, float>,
+            arithm::divMat<float, double, double>
+        },
+        {
+            0 /*arithm::divMat<double, double, unsigned char>*/,
+            0 /*arithm::divMat<double, double, signed char>*/,
+            0 /*arithm::divMat<double, double, unsigned short>*/,
+            0 /*arithm::divMat<double, double, short>*/,
+            0 /*arithm::divMat<double, double, int>*/,
+            0 /*arithm::divMat<double, double, float>*/,
+            arithm::divMat<double, double, double>
         }
+    };
 
-        dst.create(src1.size(), CV_MAKE_TYPE(ddepth, cn));
+    const int sdepth = src1.depth();
+    const int ddepth = dst.depth(); // dst is not created in this function; its depth is read as-is, so it is expected to be allocated by the caller
+    const int cn = src1.channels();
 
-        PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step);
-        PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step);
-        PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step);
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-        const func_t func = funcs[sdepth][ddepth];
+    PtrStepSzb src1_(src1.rows, src1.cols * cn, src1.data, src1.step); // width is cols * cn: channels are treated as extra columns
+    PtrStepSzb src2_(src1.rows, src1.cols * cn, src2.data, src2.step); // src2 view reuses src1's rows/cols; equal sizes are not checked here
+    PtrStepSzb dst_(src1.rows, src1.cols * cn, dst.data, dst.step); // dst view reuses src1's rows/cols as well
 
-        if (!func)
-            CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+    const func_t func = funcs[sdepth][ddepth]; // table is indexed [source depth][destination depth]; a 0 entry marks an unsupported pair
 
-        func(src1_, src2_, dst_, scale, stream);
-    }
+    if (!func)
+        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
+
+    func(src1_, src2_, dst_, scale, stream);
 }
 
 namespace arithm
 {
     template <typename T, typename S, typename D>
-    void divScalar(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    void divScalar(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
 }
 
-void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
+static void divScalar(const GpuMat& src, Scalar val, bool inv, GpuMat& dst, const GpuMat&, double scale, Stream& _stream)
 {
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
+    typedef void (*func_t)(PtrStepSzb src1, double val, bool inv, PtrStepSzb dst, cudaStream_t stream);
     static const func_t funcs[7][7] =
     {
         {
-            divScalar<unsigned char, float, unsigned char>,
-            divScalar<unsigned char, float, signed char>,
-            divScalar<unsigned char, float, unsigned short>,
-            divScalar<unsigned char, float, short>,
-            divScalar<unsigned char, float, int>,
-            divScalar<unsigned char, float, float>,
-            divScalar<unsigned char, double, double>
+            arithm::divScalar<unsigned char, float, unsigned char>,
+            arithm::divScalar<unsigned char, float, signed char>,
+            arithm::divScalar<unsigned char, float, unsigned short>,
+            arithm::divScalar<unsigned char, float, short>,
+            arithm::divScalar<unsigned char, float, int>,
+            arithm::divScalar<unsigned char, float, float>,
+            arithm::divScalar<unsigned char, double, double>
         },
         {
-            divScalar<signed char, float, unsigned char>,
-            divScalar<signed char, float, signed char>,
-            divScalar<signed char, float, unsigned short>,
-            divScalar<signed char, float, short>,
-            divScalar<signed char, float, int>,
-            divScalar<signed char, float, float>,
-            divScalar<signed char, double, double>
+            arithm::divScalar<signed char, float, unsigned char>,
+            arithm::divScalar<signed char, float, signed char>,
+            arithm::divScalar<signed char, float, unsigned short>,
+            arithm::divScalar<signed char, float, short>,
+            arithm::divScalar<signed char, float, int>,
+            arithm::divScalar<signed char, float, float>,
+            arithm::divScalar<signed char, double, double>
         },
         {
-            0 /*divScalar<unsigned short, float, unsigned char>*/,
-            0 /*divScalar<unsigned short, float, signed char>*/,
-            divScalar<unsigned short, float, unsigned short>,
-            divScalar<unsigned short, float, short>,
-            divScalar<unsigned short, float, int>,
-            divScalar<unsigned short, float, float>,
-            divScalar<unsigned short, double, double>
+            0 /*arithm::divScalar<unsigned short, float, unsigned char>*/,
+            0 /*arithm::divScalar<unsigned short, float, signed char>*/,
+            arithm::divScalar<unsigned short, float, unsigned short>,
+            arithm::divScalar<unsigned short, float, short>,
+            arithm::divScalar<unsigned short, float, int>,
+            arithm::divScalar<unsigned short, float, float>,
+            arithm::divScalar<unsigned short, double, double>
         },
         {
-            0 /*divScalar<short, float, unsigned char>*/,
-            0 /*divScalar<short, float, signed char>*/,
-            divScalar<short, float, unsigned short>,
-            divScalar<short, float, short>,
-            divScalar<short, float, int>,
-            divScalar<short, float, float>,
-            divScalar<short, double, double>
+            0 /*arithm::divScalar<short, float, unsigned char>*/,
+            0 /*arithm::divScalar<short, float, signed char>*/,
+            arithm::divScalar<short, float, unsigned short>,
+            arithm::divScalar<short, float, short>,
+            arithm::divScalar<short, float, int>,
+            arithm::divScalar<short, float, float>,
+            arithm::divScalar<short, double, double>
         },
         {
-            0 /*divScalar<int, float, unsigned char>*/,
-            0 /*divScalar<int, float, signed char>*/,
-            0 /*divScalar<int, float, unsigned short>*/,
-            0 /*divScalar<int, float, short>*/,
-            divScalar<int, float, int>,
-            divScalar<int, float, float>,
-            divScalar<int, double, double>
+            0 /*arithm::divScalar<int, float, unsigned char>*/,
+            0 /*arithm::divScalar<int, float, signed char>*/,
+            0 /*arithm::divScalar<int, float, unsigned short>*/,
+            0 /*arithm::divScalar<int, float, short>*/,
+            arithm::divScalar<int, float, int>,
+            arithm::divScalar<int, float, float>,
+            arithm::divScalar<int, double, double>
         },
         {
-            0 /*divScalar<float, float, unsigned char>*/,
-            0 /*divScalar<float, float, signed char>*/,
-            0 /*divScalar<float, float, unsigned short>*/,
-            0 /*divScalar<float, float, short>*/,
-            0 /*divScalar<float, float, int>*/,
-            divScalar<float, float, float>,
-            divScalar<float, double, double>
+            0 /*arithm::divScalar<float, float, unsigned char>*/,
+            0 /*arithm::divScalar<float, float, signed char>*/,
+            0 /*arithm::divScalar<float, float, unsigned short>*/,
+            0 /*arithm::divScalar<float, float, short>*/,
+            0 /*arithm::divScalar<float, float, int>*/,
+            arithm::divScalar<float, float, float>,
+            arithm::divScalar<float, double, double>
         },
         {
-            0 /*divScalar<double, double, unsigned char>*/,
-            0 /*divScalar<double, double, signed char>*/,
-            0 /*divScalar<double, double, unsigned short>*/,
-            0 /*divScalar<double, double, short>*/,
-            0 /*divScalar<double, double, int>*/,
-            0 /*divScalar<double, double, float>*/,
-            divScalar<double, double, double>
+            0 /*arithm::divScalar<double, double, unsigned char>*/,
+            0 /*arithm::divScalar<double, double, signed char>*/,
+            0 /*arithm::divScalar<double, double, unsigned short>*/,
+            0 /*arithm::divScalar<double, double, short>*/,
+            0 /*arithm::divScalar<double, double, int>*/,
+            0 /*arithm::divScalar<double, double, float>*/,
+            arithm::divScalar<double, double, double>
         }
     };
 
@@ -1336,32 +1298,31 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
         {0                                                    , 0, 0                                                    , 0                                                    }
     };
 
-    if (dtype < 0)
-        dtype = src.depth();
-
     const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
+    const int ddepth = dst.depth(); // depth now comes from dst itself (dtype is no longer a parameter); dst is expected to be allocated by the caller
     const int cn = src.channels();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( cn <= 4 );
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-    if (sdepth == CV_64F || ddepth == CV_64F)
+    if (inv) // assuming inv selects the val / src form (TODO confirm against the kernel)
     {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+        val[0] *= scale; // inverse form: scale * (val / src) == (scale * val) / src
+        val[1] *= scale;
+        val[2] *= scale;
+        val[3] *= scale;
+    }
+    else
+    {
+        val[0] /= scale; // direct form: scale * (src / val) == src / (val / scale)
+        val[1] /= scale;
+        val[2] /= scale;
+        val[3] /= scale;
     }
 
-    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    const Scalar nsc(sc.val[0] / scale, sc.val[1] / scale, sc.val[2] / scale, sc.val[3] / scale);
-
     const npp_func_t npp_func = npp_funcs[sdepth][cn - 1];
-    if (ddepth == sdepth && cn > 1 && npp_func != 0)
+    if (ddepth == sdepth && cn > 1 && npp_func != 0 && !inv) // the NPP path is never taken when inv is set
     {
-        npp_func(src, nsc, dst, stream);
+        npp_func(src, val, dst, stream); // val already has scale folded in (replaces the old nsc temporary)
         return;
     }
 
@@ -1372,113 +1333,39 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc
     if (!func)
         CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
 
-    func(src, nsc.val[0], dst, stream);
+    func(src, val[0], inv, dst, stream); // only val[0] is forwarded to the generic kernel — NOTE(review): confirm multi-channel inputs cannot reach this call with differing per-channel values
 }
 
-namespace arithm
+void cv::gpu::divide(InputArray _src1, InputArray _src2, OutputArray _dst, double scale, int dtype, Stream& stream)
 {
-    template <typename T, typename S, typename D>
-    void divInv(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-}
-
-void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)
-{
-    using namespace arithm;
-
-    typedef void (*func_t)(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
-    static const func_t funcs[7][7] =
+    if (_src1.type() == CV_8UC4 && _src2.type() == CV_32FC1) // dedicated 8UC4 / 32FC1 kernel; dst takes src1's type and neither scale nor dtype is passed on this path
     {
-        {
-            divInv<unsigned char, float, unsigned char>,
-            divInv<unsigned char, float, signed char>,
-            divInv<unsigned char, float, unsigned short>,
-            divInv<unsigned char, float, short>,
-            divInv<unsigned char, float, int>,
-            divInv<unsigned char, float, float>,
-            divInv<unsigned char, double, double>
-        },
-        {
-            divInv<signed char, float, unsigned char>,
-            divInv<signed char, float, signed char>,
-            divInv<signed char, float, unsigned short>,
-            divInv<signed char, float, short>,
-            divInv<signed char, float, int>,
-            divInv<signed char, float, float>,
-            divInv<signed char, double, double>
-        },
-        {
-            0 /*divInv<unsigned short, float, unsigned char>*/,
-            0 /*divInv<unsigned short, float, signed char>*/,
-            divInv<unsigned short, float, unsigned short>,
-            divInv<unsigned short, float, short>,
-            divInv<unsigned short, float, int>,
-            divInv<unsigned short, float, float>,
-            divInv<unsigned short, double, double>
-        },
-        {
-            0 /*divInv<short, float, unsigned char>*/,
-            0 /*divInv<short, float, signed char>*/,
-            divInv<short, float, unsigned short>,
-            divInv<short, float, short>,
-            divInv<short, float, int>,
-            divInv<short, float, float>,
-            divInv<short, double, double>
-        },
-        {
-            0 /*divInv<int, float, unsigned char>*/,
-            0 /*divInv<int, float, signed char>*/,
-            0 /*divInv<int, float, unsigned short>*/,
-            0 /*divInv<int, float, short>*/,
-            divInv<int, float, int>,
-            divInv<int, float, float>,
-            divInv<int, double, double>
-        },
-        {
-            0 /*divInv<float, float, unsigned char>*/,
-            0 /*divInv<float, float, signed char>*/,
-            0 /*divInv<float, float, unsigned short>*/,
-            0 /*divInv<float, float, short>*/,
-            0 /*divInv<float, float, int>*/,
-            divInv<float, float, float>,
-            divInv<float, double, double>
-        },
-        {
-            0 /*divInv<double, double, unsigned char>*/,
-            0 /*divInv<double, double, signed char>*/,
-            0 /*divInv<double, double, unsigned short>*/,
-            0 /*divInv<double, double, short>*/,
-            0 /*divInv<double, double, int>*/,
-            0 /*divInv<double, double, float>*/,
-            divInv<double, double, double>
-        }
-    };
+        GpuMat src1 = _src1.getGpuMat();
+        GpuMat src2 = _src2.getGpuMat();
 
-    if (dtype < 0)
-        dtype = src.depth();
+        CV_Assert( src1.size() == src2.size() );
 
-    const int sdepth = src.depth();
-    const int ddepth = CV_MAT_DEPTH(dtype);
-    const int cn = src.channels();
+        _dst.create(src1.size(), src1.type());
+        GpuMat dst = _dst.getGpuMat();
 
-    CV_Assert( sdepth <= CV_64F && ddepth <= CV_64F );
-    CV_Assert( cn == 1 );
-
-    if (sdepth == CV_64F || ddepth == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
+        arithm::divMat_8uc4_32f(src1, src2, dst, StreamAccessor::getStream(stream));
     }
+    else if (_src1.type() == CV_16SC4 && _src2.type() == CV_32FC1)
+    {
+        GpuMat src1 = _src1.getGpuMat();
+        GpuMat src2 = _src2.getGpuMat();
 
-    dst.create(src.size(), CV_MAKE_TYPE(ddepth, cn));
+        CV_Assert( src1.size() == src2.size() );
 
-    cudaStream_t stream = StreamAccessor::getStream(s);
+        _dst.create(src1.size(), src1.type());
+        GpuMat dst = _dst.getGpuMat();
 
-    const func_t func = funcs[sdepth][ddepth];
-
-    if (!func)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported combination of source and destination types");
-
-    func(src, scale, dst, stream);
+        arithm::divMat_16sc4_32f(src1, src2, dst, StreamAccessor::getStream(stream)); // as with the 8UC4 case, scale and dtype are not passed to this kernel
+    }
+    else
+    {
+        arithm_op(_src1, _src2, _dst, GpuMat(), scale, dtype, stream, divMat, divScalar); // generic path: arithm_op (defined elsewhere) receives both the matrix and scalar implementations; an empty GpuMat is passed as the mask
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////////
diff --git a/modules/gpuarithm/test/test_element_operations.cpp b/modules/gpuarithm/test/test_element_operations.cpp
index ece38311e..345c96015 100644
--- a/modules/gpuarithm/test/test_element_operations.cpp
+++ b/modules/gpuarithm/test/test_element_operations.cpp
@@ -1299,9 +1299,9 @@ INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar, testing::Combine(
     WHOLE_SUBMAT));
 
 ////////////////////////////////////////////////////////////////////////////////
-// Divide_Scalar_Inv
+// Divide_Scalar_First
 
-PARAM_TEST_CASE(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
+PARAM_TEST_CASE(Divide_Scalar_First, cv::gpu::DeviceInfo, cv::Size, std::pair<MatDepth, MatDepth>, UseRoi)
 {
     cv::gpu::DeviceInfo devInfo;
     cv::Size size;
@@ -1319,7 +1319,7 @@ PARAM_TEST_CASE(Divide_Scalar_Inv, cv::gpu::DeviceInfo, cv::Size, std::pair<MatD
     }
 };
 
-GPU_TEST_P(Divide_Scalar_Inv, Accuracy)
+GPU_TEST_P(Divide_Scalar_First, Accuracy)
 {
     double scale = randomDouble(0.0, 255.0);
     cv::Mat mat = randomMat(size, depth.first, 1.0, 255.0);
@@ -1348,7 +1348,7 @@ GPU_TEST_P(Divide_Scalar_Inv, Accuracy)
     }
 }
 
-INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar_Inv, testing::Combine(
+INSTANTIATE_TEST_CASE_P(GPU_Arithm, Divide_Scalar_First, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
     DEPTH_PAIRS,