Merge pull request #964 from jet47:cuda-5.5-support
commit 173442bb2e
File diff suppressed because it is too large
@@ -60,6 +60,8 @@
# include "opencv2/core/stream_accessor.hpp"
+ # include "opencv2/core/cuda/common.hpp"

+ # define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)

# define CUDART_MINIMUM_REQUIRED_VERSION 4020

# if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
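The NPP_VERSION macro added above packs the three NPP version components into one comparable integer, which is what the "#if NPP_VERSION < 5500" guards in the next hunk test against. A minimal sketch of the arithmetic, with illustrative values standing in for an NPP 5.5.0 toolkit (the real components come from the NPP headers):

// Illustrative values only; a real NPP 5.5.0 install defines these in its own headers.
#define NPP_VERSION_MAJOR 5
#define NPP_VERSION_MINOR 5
#define NPP_VERSION_BUILD 0

#define NPP_VERSION (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD)

// NPP_VERSION evaluates to 5 * 1000 + 5 * 100 + 0 = 5500, so "#if NPP_VERSION < 5500"
// keeps the pre-5.5 error-code names only for older toolkits.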
@@ -1547,48 +1547,90 @@ namespace

const ErrorEntry npp_errors [] =
{
error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),

#if defined (_MSC_VER)
#if defined (_MSC_VER)
error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
#endif
#endif

#if NPP_VERSION < 5500
error_entry( NPP_BAD_ARG_ERROR ),
error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_TEXTURE_BIND_ERROR ),
error_entry( NPP_COEFF_ERROR ),
error_entry( NPP_RECT_ERROR ),
error_entry( NPP_QUAD_ERROR ),
error_entry( NPP_MEMFREE_ERR ),
error_entry( NPP_MEMSET_ERR ),
error_entry( NPP_MEM_ALLOC_ERR ),
error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERR ),
error_entry( NPP_INVALID_INPUT ),
error_entry( NPP_POINTER_ERROR ),
error_entry( NPP_WARNING ),
error_entry( NPP_ODD_ROI_WARNING ),
#else
error_entry( NPP_INVALID_HOST_POINTER_ERROR ),
error_entry( NPP_INVALID_DEVICE_POINTER_ERROR ),
error_entry( NPP_LUT_PALETTE_BITSIZE_ERROR ),
error_entry( NPP_ZC_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_MEMFREE_ERROR ),
error_entry( NPP_MEMSET_ERROR ),
error_entry( NPP_QUALITY_INDEX_ERROR ),
error_entry( NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_CHANNEL_ORDER_ERROR ),
error_entry( NPP_ZERO_MASK_VALUE_ERROR ),
error_entry( NPP_QUADRANGLE_ERROR ),
error_entry( NPP_RECTANGLE_ERROR ),
error_entry( NPP_COEFFICIENT_ERROR ),
error_entry( NPP_NUMBER_OF_CHANNELS_ERROR ),
error_entry( NPP_COI_ERROR ),
error_entry( NPP_DIVISOR_ERROR ),
error_entry( NPP_CHANNEL_ERROR ),
error_entry( NPP_STRIDE_ERROR ),
error_entry( NPP_ANCHOR_ERROR ),
error_entry( NPP_MASK_SIZE_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERROR ),
error_entry( NPP_MOMENT_00_ZERO_ERROR ),
error_entry( NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR ),
error_entry( NPP_THRESHOLD_ERROR ),
error_entry( NPP_CONTEXT_MATCH_ERROR ),
error_entry( NPP_FFT_FLAG_ERROR ),
error_entry( NPP_FFT_ORDER_ERROR ),
error_entry( NPP_SCALE_RANGE_ERROR ),
error_entry( NPP_DATA_TYPE_ERROR ),
error_entry( NPP_OUT_OFF_RANGE_ERROR ),
error_entry( NPP_DIVIDE_BY_ZERO_ERROR ),
error_entry( NPP_MEMORY_ALLOCATION_ERR ),
error_entry( NPP_RANGE_ERROR ),
error_entry( NPP_BAD_ARGUMENT_ERROR ),
error_entry( NPP_NO_MEMORY_ERROR ),
error_entry( NPP_ERROR_RESERVED ),
error_entry( NPP_NO_OPERATION_WARNING ),
error_entry( NPP_DIVIDE_BY_ZERO_WARNING ),
error_entry( NPP_WRONG_INTERSECTION_ROI_WARNING ),
#endif

error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_TEXTURE_BIND_ERROR ),
error_entry( NPP_WRONG_INTERSECTION_ROI_ERROR ),
error_entry( NPP_NOT_EVEN_STEP_ERROR ),
error_entry( NPP_INTERPOLATION_ERROR ),
error_entry( NPP_RESIZE_FACTOR_ERROR ),
error_entry( NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR ),
error_entry( NPP_MEMFREE_ERR ),
error_entry( NPP_MEMSET_ERR ),
error_entry( NPP_MEMCPY_ERROR ),
error_entry( NPP_MEM_ALLOC_ERR ),
error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERR ),
error_entry( NPP_INVALID_INPUT ),
error_entry( NPP_ALIGNMENT_ERROR ),
error_entry( NPP_STEP_ERROR ),
error_entry( NPP_SIZE_ERROR ),
error_entry( NPP_POINTER_ERROR ),
error_entry( NPP_NULL_POINTER_ERROR ),
error_entry( NPP_CUDA_KERNEL_EXECUTION_ERROR ),
error_entry( NPP_NOT_IMPLEMENTED_ERROR ),
error_entry( NPP_ERROR ),
error_entry( NPP_NO_ERROR ),
error_entry( NPP_SUCCESS ),
error_entry( NPP_WARNING ),
error_entry( NPP_WRONG_INTERSECTION_QUAD_WARNING ),
error_entry( NPP_MISALIGNED_DST_ROI_WARNING ),
error_entry( NPP_AFFINE_QUAD_INCORRECT_WARNING ),
error_entry( NPP_DOUBLE_SIZE_WARNING ),
error_entry( NPP_ODD_ROI_WARNING )
error_entry( NPP_DOUBLE_SIZE_WARNING )
};

const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
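For context, the table above feeds a simple status-code-to-string lookup. A hedged sketch of that pattern follows; the ErrorEntry layout and the error_entry macro are written here from the usual OpenCV convention, not copied from this diff:

#include <cstddef>

// Assumed layout: each entry pairs an NPP status code with its symbolic name.
struct ErrorEntry
{
    int code;
    const char* str;
};

// The error_entry macro stringizes the enumerator so the table stays in sync with the name.
#define error_entry(entry) { entry, #entry }

// Linear lookup used when an nppSafeCall-style wrapper reports a failure.
inline const char* getErrorString(int code, const ErrorEntry* errors, size_t n)
{
    for (size_t i = 0; i < n; ++i)
        if (errors[i].code == code)
            return errors[i].str;
    return "Unknown error code";
}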
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace cudev

template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
- I d = a - b;
+ I d = saturate_cast<I>(a - b);
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z;

@@ -169,7 +169,7 @@ namespace cv { namespace gpu { namespace cudev

template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
- I d = a - b;
+ I d = saturate_cast<I>(a - b);
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z &&
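The explicit saturate_cast matters because, for unsigned channel types, the raw difference a - b can wrap around, and subtracting vector types may produce a wider intermediate type; the cast clamps the result back into the value range of I. A hedged scalar illustration in plain C++ (the kernels above use the cudev vector overloads instead):

#include <algorithm>
#include <cstdio>

// Simplified scalar stand-in for saturate_cast<unsigned char>(int).
static unsigned char saturate_to_uchar(int v)
{
    return static_cast<unsigned char>(std::min(std::max(v, 0), 255));
}

int main()
{
    unsigned char a = 10, b = 20;
    int raw = a - b;                                         // -10 after integer promotion
    unsigned char wrapped = static_cast<unsigned char>(raw); // 246: modular wrap-around
    unsigned char clamped = saturate_to_uchar(raw);          // 0: clamped to the uchar range
    std::printf("raw=%d wrapped=%u clamped=%u\n", raw, wrapped, clamped);
    return 0;
}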
@@ -62,8 +62,8 @@ namespace arithm
return vabsdiff4(a, b);
}

- __device__ __forceinline__ VAbsDiff4() {}
- __device__ __forceinline__ VAbsDiff4(const VAbsDiff4& other) {}
+ __host__ __device__ __forceinline__ VAbsDiff4() {}
+ __host__ __device__ __forceinline__ VAbsDiff4(const VAbsDiff4&) {}
};

struct VAbsDiff2 : binary_function<uint, uint, uint>

@@ -73,8 +73,8 @@ namespace arithm
return vabsdiff2(a, b);
}

- __device__ __forceinline__ VAbsDiff2() {}
- __device__ __forceinline__ VAbsDiff2(const VAbsDiff2& other) {}
+ __host__ __device__ __forceinline__ VAbsDiff2() {}
+ __host__ __device__ __forceinline__ VAbsDiff2(const VAbsDiff2&) {}
};

__device__ __forceinline__ int _abs(int a)

@@ -97,8 +97,8 @@ namespace arithm
return saturate_cast<T>(_abs(a - b));
}

- __device__ __forceinline__ AbsDiffMat() {}
- __device__ __forceinline__ AbsDiffMat(const AbsDiffMat& other) {}
+ __host__ __device__ __forceinline__ AbsDiffMat() {}
+ __host__ __device__ __forceinline__ AbsDiffMat(const AbsDiffMat&) {}
};
}
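The constructor changes repeated throughout these functors follow one pattern: the explicitly defined default and copy constructors gain __host__ (and the scalar functors' converting constructors below gain __host__ explicit), presumably so the functor objects can be constructed and copied in host code when they are passed by value into kernel launches, while staying usable on the device. A hedged, simplified sketch of such a functor (illustrative names, not the actual OpenCV definitions):

#include <cuda_runtime.h>

// Illustrative functor: built on the host, invoked per element on the device.
struct ScaleBy
{
    float scale;

    // Host-only converting constructor: the object is created in host code.
    __host__ explicit ScaleBy(float scale_) : scale(scale_) {}

    // Host+device default/copy constructors: copying can happen on both sides
    // when the functor is passed by value to a kernel.
    __host__ __device__ __forceinline__ ScaleBy() : scale(1.0f) {}
    __host__ __device__ __forceinline__ ScaleBy(const ScaleBy& other) : scale(other.scale) {}

    // The call operator itself only ever runs on the device.
    __device__ __forceinline__ float operator ()(float x) const
    {
        return x * scale;
    }
};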
@@ -59,7 +59,7 @@ namespace arithm
{
S val;

- explicit AbsDiffScalar(S val_) : val(val_) {}
+ __host__ explicit AbsDiffScalar(S val_) : val(val_) {}

__device__ __forceinline__ T operator ()(T a) const
{

@@ -62,8 +62,8 @@ namespace arithm
return vadd4(a, b);
}

- __device__ __forceinline__ VAdd4() {}
- __device__ __forceinline__ VAdd4(const VAdd4& other) {}
+ __host__ __device__ __forceinline__ VAdd4() {}
+ __host__ __device__ __forceinline__ VAdd4(const VAdd4&) {}
};

struct VAdd2 : binary_function<uint, uint, uint>

@@ -73,8 +73,8 @@ namespace arithm
return vadd2(a, b);
}

- __device__ __forceinline__ VAdd2() {}
- __device__ __forceinline__ VAdd2(const VAdd2& other) {}
+ __host__ __device__ __forceinline__ VAdd2() {}
+ __host__ __device__ __forceinline__ VAdd2(const VAdd2&) {}
};

template <typename T, typename D> struct AddMat : binary_function<T, T, D>

@@ -84,8 +84,8 @@ namespace arithm
return saturate_cast<D>(a + b);
}

- __device__ __forceinline__ AddMat() {}
- __device__ __forceinline__ AddMat(const AddMat& other) {}
+ __host__ __device__ __forceinline__ AddMat() {}
+ __host__ __device__ __forceinline__ AddMat(const AddMat&) {}
};
}

@@ -59,7 +59,7 @@ namespace arithm
{
S val;

- explicit AddScalar(S val_) : val(val_) {}
+ __host__ explicit AddScalar(S val_) : val(val_) {}

__device__ __forceinline__ D operator ()(T a) const
{

@@ -74,7 +74,7 @@ namespace arithm
float beta;
float gamma;

- AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}
+ __host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(static_cast<float>(alpha_)), beta(static_cast<float>(beta_)), gamma(static_cast<float>(gamma_)) {}

__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{

@@ -87,7 +87,7 @@ namespace arithm
double beta;
double gamma;

- AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}
+ __host__ AddWeighted_(double alpha_, double beta_, double gamma_) : alpha(alpha_), beta(beta_), gamma(gamma_) {}

__device__ __forceinline__ D operator ()(T1 a, T2 b) const
{

@@ -62,8 +62,8 @@ namespace arithm
return vcmpeq4(a, b);
}

- __device__ __forceinline__ VCmpEq4() {}
- __device__ __forceinline__ VCmpEq4(const VCmpEq4& other) {}
+ __host__ __device__ __forceinline__ VCmpEq4() {}
+ __host__ __device__ __forceinline__ VCmpEq4(const VCmpEq4&) {}
};
struct VCmpNe4 : binary_function<uint, uint, uint>
{

@@ -72,8 +72,8 @@ namespace arithm
return vcmpne4(a, b);
}

- __device__ __forceinline__ VCmpNe4() {}
- __device__ __forceinline__ VCmpNe4(const VCmpNe4& other) {}
+ __host__ __device__ __forceinline__ VCmpNe4() {}
+ __host__ __device__ __forceinline__ VCmpNe4(const VCmpNe4&) {}
};
struct VCmpLt4 : binary_function<uint, uint, uint>
{

@@ -82,8 +82,8 @@ namespace arithm
return vcmplt4(a, b);
}

- __device__ __forceinline__ VCmpLt4() {}
- __device__ __forceinline__ VCmpLt4(const VCmpLt4& other) {}
+ __host__ __device__ __forceinline__ VCmpLt4() {}
+ __host__ __device__ __forceinline__ VCmpLt4(const VCmpLt4&) {}
};
struct VCmpLe4 : binary_function<uint, uint, uint>
{

@@ -92,8 +92,8 @@ namespace arithm
return vcmple4(a, b);
}

- __device__ __forceinline__ VCmpLe4() {}
- __device__ __forceinline__ VCmpLe4(const VCmpLe4& other) {}
+ __host__ __device__ __forceinline__ VCmpLe4() {}
+ __host__ __device__ __forceinline__ VCmpLe4(const VCmpLe4&) {}
};

template <class Op, typename T>
@@ -45,6 +45,7 @@
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"

@@ -59,7 +59,7 @@ namespace arithm
{
S val;

- explicit DivInv(S val_) : val(val_) {}
+ __host__ explicit DivInv(S val_) : val(val_) {}

__device__ __forceinline__ D operator ()(T a) const
{

@@ -91,8 +91,8 @@ namespace arithm
return b != 0 ? saturate_cast<D>(a / b) : 0;
}

- __device__ __forceinline__ Div() {}
- __device__ __forceinline__ Div(const Div& other) {}
+ __host__ __device__ __forceinline__ Div() {}
+ __host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, float> : binary_function<T, T, float>
{

@@ -101,8 +101,8 @@ namespace arithm
return b != 0 ? static_cast<float>(a) / b : 0;
}

- __device__ __forceinline__ Div() {}
- __device__ __forceinline__ Div(const Div& other) {}
+ __host__ __device__ __forceinline__ Div() {}
+ __host__ __device__ __forceinline__ Div(const Div&) {}
};
template <typename T> struct Div<T, double> : binary_function<T, T, double>
{

@@ -111,15 +111,15 @@ namespace arithm
return b != 0 ? static_cast<double>(a) / b : 0;
}

- __device__ __forceinline__ Div() {}
- __device__ __forceinline__ Div(const Div& other) {}
+ __host__ __device__ __forceinline__ Div() {}
+ __host__ __device__ __forceinline__ Div(const Div&) {}
};

template <typename T, typename S, typename D> struct DivScale : binary_function<T, T, D>
{
S scale;

- explicit DivScale(S scale_) : scale(scale_) {}
+ __host__ explicit DivScale(S scale_) : scale(scale_) {}

__device__ __forceinline__ D operator ()(T a, T b) const
{

@@ -59,7 +59,7 @@ namespace arithm
{
S val;

- explicit DivScalar(S val_) : val(val_) {}
+ __host__ explicit DivScalar(S val_) : val(val_) {}

__device__ __forceinline__ D operator ()(T a) const
{

@@ -94,8 +94,8 @@ namespace arithm
return saturate_cast<T>(x * x);
}

- __device__ __forceinline__ Sqr() {}
- __device__ __forceinline__ Sqr(const Sqr& other) {}
+ __host__ __device__ __forceinline__ Sqr() {}
+ __host__ __device__ __forceinline__ Sqr(const Sqr&) {}
};
}

@@ -190,8 +190,8 @@ namespace arithm
return saturate_cast<T>(f(x));
}

- __device__ __forceinline__ Exp() {}
- __device__ __forceinline__ Exp(const Exp& other) {}
+ __host__ __device__ __forceinline__ Exp() {}
+ __host__ __device__ __forceinline__ Exp(const Exp&) {}
};
}

@@ -228,7 +228,7 @@ namespace arithm
{
float power;

- PowOp(double power_) : power(static_cast<float>(power_)) {}
+ __host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}

__device__ __forceinline__ T operator()(T e) const
{

@@ -239,7 +239,7 @@ namespace arithm
{
float power;

- PowOp(double power_) : power(static_cast<float>(power_)) {}
+ __host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}

__device__ __forceinline__ T operator()(T e) const
{

@@ -255,7 +255,7 @@ namespace arithm
{
float power;

- PowOp(double power_) : power(static_cast<float>(power_)) {}
+ __host__ explicit PowOp(double power_) : power(static_cast<float>(power_)) {}

__device__ __forceinline__ float operator()(float e) const
{

@@ -266,7 +266,7 @@ namespace arithm
{
double power;

- PowOp(double power_) : power(power_) {}
+ __host__ explicit PowOp(double power_) : power(power_) {}

__device__ __forceinline__ double operator()(double e) const
{
@@ -45,6 +45,7 @@
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"

@@ -65,8 +65,8 @@ namespace arithm
return vmin4(a, b);
}

- __device__ __forceinline__ VMin4() {}
- __device__ __forceinline__ VMin4(const VMin4& other) {}
+ __host__ __device__ __forceinline__ VMin4() {}
+ __host__ __device__ __forceinline__ VMin4(const VMin4&) {}
};

struct VMin2 : binary_function<uint, uint, uint>

@@ -76,8 +76,8 @@ namespace arithm
return vmin2(a, b);
}

- __device__ __forceinline__ VMin2() {}
- __device__ __forceinline__ VMin2(const VMin2& other) {}
+ __host__ __device__ __forceinline__ VMin2() {}
+ __host__ __device__ __forceinline__ VMin2(const VMin2&) {}
};
}

@@ -151,8 +151,8 @@ namespace arithm
return vmax4(a, b);
}

- __device__ __forceinline__ VMax4() {}
- __device__ __forceinline__ VMax4(const VMax4& other) {}
+ __host__ __device__ __forceinline__ VMax4() {}
+ __host__ __device__ __forceinline__ VMax4(const VMax4&) {}
};

struct VMax2 : binary_function<uint, uint, uint>

@@ -162,8 +162,8 @@ namespace arithm
return vmax2(a, b);
}

- __device__ __forceinline__ VMax2() {}
- __device__ __forceinline__ VMax2(const VMax2& other) {}
+ __host__ __device__ __forceinline__ VMax2() {}
+ __host__ __device__ __forceinline__ VMax2(const VMax2&) {}
};
}

@@ -45,6 +45,7 @@
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"

@@ -69,8 +69,8 @@ namespace arithm
return res;
}

- __device__ __forceinline__ Mul_8uc4_32f() {}
- __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f& other) {}
+ __host__ __device__ __forceinline__ Mul_8uc4_32f() {}
+ __host__ __device__ __forceinline__ Mul_8uc4_32f(const Mul_8uc4_32f&) {}
};

struct Mul_16sc4_32f : binary_function<short4, float, short4>

@@ -81,8 +81,8 @@ namespace arithm
saturate_cast<short>(a.z * b), saturate_cast<short>(a.w * b));
}

- __device__ __forceinline__ Mul_16sc4_32f() {}
- __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f& other) {}
+ __host__ __device__ __forceinline__ Mul_16sc4_32f() {}
+ __host__ __device__ __forceinline__ Mul_16sc4_32f(const Mul_16sc4_32f&) {}
};

template <typename T, typename D> struct Mul : binary_function<T, T, D>

@@ -92,15 +92,15 @@ namespace arithm
return saturate_cast<D>(a * b);
}

- __device__ __forceinline__ Mul() {}
- __device__ __forceinline__ Mul(const Mul& other) {}
+ __host__ __device__ __forceinline__ Mul() {}
+ __host__ __device__ __forceinline__ Mul(const Mul&) {}
};

template <typename T, typename S, typename D> struct MulScale : binary_function<T, T, D>
{
S scale;

- explicit MulScale(S scale_) : scale(scale_) {}
+ __host__ explicit MulScale(S scale_) : scale(scale_) {}

__device__ __forceinline__ D operator ()(T a, T b) const
{

@@ -59,7 +59,7 @@ namespace arithm
{
S val;

- explicit MulScalar(S val_) : val(val_) {}
+ __host__ explicit MulScalar(S val_) : val(val_) {}

__device__ __forceinline__ D operator ()(T a) const
{

@@ -46,6 +46,7 @@
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
@@ -76,8 +77,8 @@ namespace reduce
return r;
}

- __device__ __forceinline__ Sum() {}
- __device__ __forceinline__ Sum(const Sum&) {}
+ __host__ __device__ __forceinline__ Sum() {}
+ __host__ __device__ __forceinline__ Sum(const Sum&) {}
};

struct Avg

@@ -100,8 +101,8 @@ namespace reduce
return r / sz;
}

- __device__ __forceinline__ Avg() {}
- __device__ __forceinline__ Avg(const Avg&) {}
+ __host__ __device__ __forceinline__ Avg() {}
+ __host__ __device__ __forceinline__ Avg(const Avg&) {}
};

struct Min

@@ -125,8 +126,8 @@ namespace reduce
return r;
}

- __device__ __forceinline__ Min() {}
- __device__ __forceinline__ Min(const Min&) {}
+ __host__ __device__ __forceinline__ Min() {}
+ __host__ __device__ __forceinline__ Min(const Min&) {}
};

struct Max

@@ -150,8 +151,8 @@ namespace reduce
return r;
}

- __device__ __forceinline__ Max() {}
- __device__ __forceinline__ Max(const Max&) {}
+ __host__ __device__ __forceinline__ Max() {}
+ __host__ __device__ __forceinline__ Max(const Max&) {}
};

///////////////////////////////////////////////////////////

@@ -62,8 +62,8 @@ namespace arithm
return vsub4(a, b);
}

- __device__ __forceinline__ VSub4() {}
- __device__ __forceinline__ VSub4(const VSub4& other) {}
+ __host__ __device__ __forceinline__ VSub4() {}
+ __host__ __device__ __forceinline__ VSub4(const VSub4&) {}
};

struct VSub2 : binary_function<uint, uint, uint>

@@ -73,8 +73,8 @@ namespace arithm
return vsub2(a, b);
}

- __device__ __forceinline__ VSub2() {}
- __device__ __forceinline__ VSub2(const VSub2& other) {}
+ __host__ __device__ __forceinline__ VSub2() {}
+ __host__ __device__ __forceinline__ VSub2(const VSub2&) {}
};

template <typename T, typename D> struct SubMat : binary_function<T, T, D>

@@ -84,8 +84,8 @@ namespace arithm
return saturate_cast<D>(a - b);
}

- __device__ __forceinline__ SubMat() {}
- __device__ __forceinline__ SubMat(const SubMat& other) {}
+ __host__ __device__ __forceinline__ SubMat() {}
+ __host__ __device__ __forceinline__ SubMat(const SubMat&) {}
};
}
@@ -59,7 +59,7 @@ namespace arithm
{
S val;

- explicit SubScalar(S val_) : val(val_) {}
+ __host__ explicit SubScalar(S val_) : val(val_) {}

__device__ __forceinline__ D operator ()(T a) const
{

@@ -45,6 +45,7 @@
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/utility.hpp"
@@ -72,7 +72,7 @@ PERF_TEST_P(Sz_Type_KernelSz, Blur,

TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));

- GPU_SANITY_CHECK(dst);
+ GPU_SANITY_CHECK(dst, 1);
}
else
{
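The GPU_SANITY_CHECK changes in these perf tests add an explicit tolerance, so the recorded result may differ slightly from the stored reference (useful when a new CUDA/NPP release perturbs low-order bits). A hedged sketch of what such a tolerance-based comparison amounts to (the real macro belongs to the OpenCV perf framework; this is only an illustration):

#include <opencv2/core/core.hpp>
#include <cassert>

// Illustrative stand-in: pass if the largest per-element difference is within eps.
static void sanityCheck(const cv::Mat& actual, const cv::Mat& reference, double eps)
{
    const double maxAbsDiff = cv::norm(actual, reference, cv::NORM_INF);
    assert(maxAbsDiff <= eps); // e.g. eps = 1 tolerates off-by-one pixel values
    (void)maxAbsDiff;
}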
@@ -48,6 +48,7 @@
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/dynamic_smem.hpp"

@@ -811,7 +812,7 @@ namespace cv { namespace gpu { namespace cudev

const int ind = ::atomicAdd(r_sizes + n, 1);
if (ind < maxSize)
- r_table(n, ind) = p - templCenter;
+ r_table(n, ind) = saturate_cast<short2>(p - templCenter);
}

void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,

@@ -855,7 +856,7 @@ namespace cv { namespace gpu { namespace cudev

for (int j = 0; j < r_row_size; ++j)
{
- short2 c = p - r_row[j];
+ int2 c = p - r_row[j];

c.x = __float2int_rn(c.x * idp);
c.y = __float2int_rn(c.y * idp);
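Both Hough changes deal with the type of a vector difference: subtracting the small integer vectors appears to yield a wider result, which is either kept as int2 or explicitly saturated back to short2 before being stored in the R-table. A hedged illustration of the clamping step (saturate_cast<short2> is the real cudev helper; this function is only a stand-in to show the effect):

#include <cuda_runtime.h>
#include <climits>

// Illustrative stand-in for saturate_cast<short2>(int2): clamp each
// coordinate into the short range before storing it in a short2 table entry.
__host__ __device__ __forceinline__ short2 clampToShort2(int2 v)
{
    short2 r;
    r.x = (short)(v.x < SHRT_MIN ? SHRT_MIN : (v.x > SHRT_MAX ? SHRT_MAX : v.x));
    r.y = (short)(v.y < SHRT_MIN ? SHRT_MIN : (v.y > SHRT_MAX ? SHRT_MAX : v.y));
    return r;
}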
@@ -84,7 +84,7 @@ PERF_TEST_P(ImagePair, InterpolateFrames,

TEST_CYCLE() cv::gpu::interpolateFrames(d_frame0, d_frame1, d_fu, d_fv, d_bu, d_bv, 0.5f, newFrame, d_buf);

- GPU_SANITY_CHECK(newFrame);
+ GPU_SANITY_CHECK(newFrame, 1e-4);
}
else
{

@@ -123,7 +123,7 @@ PERF_TEST_P(ImagePair, CreateOpticalFlowNeedleMap,

TEST_CYCLE() cv::gpu::createOpticalFlowNeedleMap(u, v, vertex, colors);

- GPU_SANITY_CHECK(vertex);
+ GPU_SANITY_CHECK(vertex, 1e-6);
GPU_SANITY_CHECK(colors);
}
else

@@ -161,8 +161,8 @@ PERF_TEST_P(ImagePair, BroxOpticalFlow,

TEST_CYCLE() d_flow(d_frame0, d_frame1, u, v);

- GPU_SANITY_CHECK(u);
- GPU_SANITY_CHECK(v);
+ GPU_SANITY_CHECK(u, 1e-1);
+ GPU_SANITY_CHECK(v, 1e-1);
}
else
{
@@ -103,8 +103,8 @@ GPU_TEST_P(BroxOpticalFlow, Regression)
for (int i = 0; i < v_gold.rows; ++i)
f.read(v_gold.ptr<char>(i), v_gold.cols * sizeof(float));

- EXPECT_MAT_NEAR(u_gold, u, 0);
- EXPECT_MAT_NEAR(v_gold, v, 0);
+ EXPECT_MAT_SIMILAR(u_gold, u, 1e-3);
+ EXPECT_MAT_SIMILAR(v_gold, v, 1e-3);
#else
std::ofstream f(fname.c_str(), std::ios_base::binary);
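The regression test switches from an exact comparison (EXPECT_MAT_NEAR with a zero tolerance) to a similarity check, which bounds a normalized difference instead of requiring bit-exact output across CUDA releases. A hedged sketch of one such relative measure (an illustration of the idea, not the exact helper behind EXPECT_MAT_SIMILAR):

#include <opencv2/core/core.hpp>

// Relative L2 difference: 0 means identical; a value below a small threshold
// such as 1e-3 means the flow fields agree to within rounding noise.
static double relativeDiff(const cv::Mat& gold, const cv::Mat& actual)
{
    const double denom = cv::norm(gold, cv::NORM_L2);
    const double diff  = cv::norm(gold, actual, cv::NORM_L2);
    return denom > 0.0 ? diff / denom : diff;
}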