added gpu::compare with scalars (Feature #1754)

2012-05-12 09:45:21 +00:00 · 2012-05-12 09:45:21 +00:00 · 9df6e51a5b
commit 9df6e51a5b
parent abe16352d6
4 changed files with 410 additions and 98 deletions
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@ -527,6 +527,7 @@ CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream

 //! compares elements of two arrays (c = a <cmpop> b)
 CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
+CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& dst, int cmpop, Stream& stream = Stream::Null());

 //! performs per-elements bit-wise inversion
 CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
@ -1371,7 +1372,7 @@ private:
    std::vector<GpuMat> trainDescCollection;
 };

-template <class Distance> 
+template <class Distance>
 class CV_EXPORTS BruteForceMatcher_GPU;

 template <typename T>
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@ -1158,130 +1158,279 @@ namespace cv { namespace gpu { namespace device
    //////////////////////////////////////////////////////////////////////////////////////
    // Compare

-    template <typename T> struct Equal : binary_function<T, T, uchar>
+#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
+
+    template <template <typename> class Op, typename T, int cn> struct Compare;
+    template <template <typename> class Op, typename T>
+    struct Compare<Op, T, 1>: binary_function<T, T, uchar>
    {
        __device__ __forceinline__ uchar operator()(T src1, T src2) const
        {
-            return static_cast<uchar>((src1 == src2) * 255);
+            Op<T> op;
+            return static_cast<uchar>(static_cast<int>(op(src1, src2)) * 255);
        }
    };
-    template <typename T> struct NotEqual : binary_function<T, T, uchar>
+    template <template <typename> class Op, typename T>
+    struct Compare<Op, T, 2>: binary_function<TYPE_VEC(T, 2), TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
    {
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const
+        __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src1, const TYPE_VEC(T, 2) & src2) const
        {
-            return static_cast<uchar>((src1 != src2) * 255);
+            Op<T> op;
+            return VecTraits<TYPE_VEC(uchar, 2)>::make(
+                        static_cast<uchar>(static_cast<int>(op(src1.x, src2.x)) * 255),
+                        static_cast<uchar>(static_cast<int>(op(src1.y, src2.y)) * 255));
        }
    };
-    template <typename T> struct Less : binary_function<T, T, uchar>
+    template <template <typename> class Op, typename T>
+    struct Compare<Op, T, 3>: binary_function<TYPE_VEC(T, 3), TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
    {
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const
+        __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src1, const TYPE_VEC(T, 3) & src2) const
        {
-            return static_cast<uchar>((src1 < src2) * 255);
+            Op<T> op;
+            return VecTraits<TYPE_VEC(uchar, 3)>::make(
+                        static_cast<uchar>(static_cast<int>(op(src1.x, src2.x)) * 255),
+                        static_cast<uchar>(static_cast<int>(op(src1.y, src2.y)) * 255),
+                        static_cast<uchar>(static_cast<int>(op(src1.z, src2.z)) * 255));
        }
    };
-    template <typename T> struct LessEqual : binary_function<T, T, uchar>
+    template <template <typename> class Op, typename T>
+    struct Compare<Op, T, 4>: binary_function<TYPE_VEC(T, 4), TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
    {
-        __device__ __forceinline__ uchar operator()(T src1, T src2) const
+        __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src1, const TYPE_VEC(T, 4) & src2) const
        {
-            return static_cast<uchar>((src1 <= src2) * 255);
+            Op<T> op;
+            return VecTraits<TYPE_VEC(uchar, 4)>::make(
+                        static_cast<uchar>(static_cast<int>(op(src1.x, src2.x)) * 255),
+                        static_cast<uchar>(static_cast<int>(op(src1.y, src2.y)) * 255),
+                        static_cast<uchar>(static_cast<int>(op(src1.z, src2.z)) * 255),
+                        static_cast<uchar>(static_cast<int>(op(src1.w, src2.w)) * 255));
        }
    };

-    template <> struct TransformFunctorTraits< Equal<int> > : DefaultTransformFunctorTraits< Equal<int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Equal<float> > : DefaultTransformFunctorTraits< Equal<float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< NotEqual<int> > : DefaultTransformFunctorTraits< NotEqual<int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< NotEqual<float> > : DefaultTransformFunctorTraits< NotEqual<float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Less<int> > : DefaultTransformFunctorTraits< Less<int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< Less<float> > : DefaultTransformFunctorTraits< Less<float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< LessEqual<int> > : DefaultTransformFunctorTraits< LessEqual<int> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits< LessEqual<float> > : DefaultTransformFunctorTraits< LessEqual<float> >
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
+#undef TYPE_VEC
+
+#define IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(op, type, block_dim_y, shift) \
+    template <> struct TransformFunctorTraits< Compare<op, type, 1> > : DefaultTransformFunctorTraits< Compare<op, type, 1> > \
+    { \
+        enum { smart_block_dim_y = block_dim_y }; \
+        enum { smart_shift = shift }; \
    };

-    template <template <typename> class Op, typename T> void compare(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, int, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(equal_to, float, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(not_equal_to, int, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(not_equal_to, float, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(greater, int, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(greater, float, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less, int, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less, float, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(greater_equal, int, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(greater_equal, float, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, int, 8, 4)
+    IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS(less_equal, float, 8, 4)
+
+#undef IMPLEMENT_COMPARE_TRANSFORM_FUNCTOR_TRAITS
+
+    template <template <typename> class Op, typename T> void compare(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
    {
-        Op<T> op;
+        Compare<Op, T, 1> op;
        cv::gpu::device::transform(static_cast< DevMem2D_<T> >(src1), static_cast< DevMem2D_<T> >(src2), dst, op, WithOutMask(), stream);
    }

-    template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void compare_eq(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
    {
-        compare<Equal, T>(src1, src2, dst, stream);
+        compare<equal_to, T>(src1, src2, dst, stream);
    }
-    template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void compare_ne(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
    {
-        compare<NotEqual, T>(src1, src2, dst, stream);
+        compare<not_equal_to, T>(src1, src2, dst, stream);
    }
-    template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void compare_lt(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
    {
-        compare<Less, T>(src1, src2, dst, stream);
+        compare<less, T>(src1, src2, dst, stream);
    }
-    template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream)
+    template <typename T> void compare_le(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream)
    {
-        compare<LessEqual, T>(src1, src2, dst, stream);
+        compare<less_equal, T>(src1, src2, dst, stream);
    }

-    template void compare_eq<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_eq<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_eq<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_eq<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_eq<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_eq<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_eq<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    template void compare_eq<uchar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<schar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<ushort>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<short >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<int   >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<float >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<double>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);

-    template void compare_ne<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_ne<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_ne<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_ne<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_ne<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_ne<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_ne<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    template void compare_ne<uchar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<schar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<ushort>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<short >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<int   >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<float >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<double>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);

-    template void compare_lt<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_lt<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_lt<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_lt<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_lt<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_lt<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_lt<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    template void compare_lt<uchar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<schar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<ushort>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<short >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<int   >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<float >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<double>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);

-    template void compare_le<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_le<schar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_le<ushort>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_le<short >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_le<int   >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_le<float >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template void compare_le<double>(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    template void compare_le<uchar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<schar >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<ushort>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<short >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<int   >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<float >(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<double>(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+
+    template <template <typename> class Op, typename T, int cn> void compare(DevMem2Db src, double val[4], DevMem2Db dst, cudaStream_t stream)
+    {
+        typedef typename TypeVec<T, cn>::vec_type src_t;
+        typedef typename TypeVec<uchar, cn>::vec_type dst_t;
+
+        T sval[] = {static_cast<T>(val[0]), static_cast<T>(val[1]), static_cast<T>(val[2]), static_cast<T>(val[3])};
+        src_t val1 = VecTraits<src_t>::make(sval);
+
+        Compare<Op, T, cn> op;
+
+        cv::gpu::device::transform(static_cast< DevMem2D_<src_t> >(src), static_cast< DevMem2D_<dst_t> >(dst), cv::gpu::device::bind2nd(op, val1), WithOutMask(), stream);
+    }
+
+    template <typename T> void compare_eq(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(DevMem2Db src, double val[4], DevMem2Db dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0,
+            compare<equal_to, T, 1>,
+            compare<equal_to, T, 2>,
+            compare<equal_to, T, 3>,
+            compare<equal_to, T, 4>
+        };
+
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void compare_ne(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(DevMem2Db src, double val[4], DevMem2Db dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0,
+            compare<not_equal_to, T, 1>,
+            compare<not_equal_to, T, 2>,
+            compare<not_equal_to, T, 3>,
+            compare<not_equal_to, T, 4>
+        };
+
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void compare_lt(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(DevMem2Db src, double val[4], DevMem2Db dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0,
+            compare<less, T, 1>,
+            compare<less, T, 2>,
+            compare<less, T, 3>,
+            compare<less, T, 4>
+        };
+
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void compare_le(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(DevMem2Db src, double val[4], DevMem2Db dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0,
+            compare<less_equal, T, 1>,
+            compare<less_equal, T, 2>,
+            compare<less_equal, T, 3>,
+            compare<less_equal, T, 4>
+        };
+
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void compare_gt(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(DevMem2Db src, double val[4], DevMem2Db dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0,
+            compare<greater, T, 1>,
+            compare<greater, T, 2>,
+            compare<greater, T, 3>,
+            compare<greater, T, 4>
+        };
+
+        funcs[cn](src, val, dst, stream);
+    }
+    template <typename T> void compare_ge(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream)
+    {
+        typedef void (*func_t)(DevMem2Db src, double val[4], DevMem2Db dst, cudaStream_t stream);
+        static const func_t funcs[] =
+        {
+            0,
+            compare<greater_equal, T, 1>,
+            compare<greater_equal, T, 2>,
+            compare<greater_equal, T, 3>,
+            compare<greater_equal, T, 4>
+        };
+
+        funcs[cn](src, val, dst, stream);
+    }
+
+    template void compare_eq<uchar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<schar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<ushort>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<short >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<int   >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<float >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_eq<double>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+
+    template void compare_ne<uchar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<schar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<ushort>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<short >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<int   >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<float >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ne<double>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+
+    template void compare_lt<uchar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<schar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<ushort>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<short >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<int   >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<float >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_lt<double>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+
+    template void compare_le<uchar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<schar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<ushort>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<short >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<int   >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<float >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_le<double>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+
+    template void compare_gt<uchar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_gt<schar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_gt<ushort>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_gt<short >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_gt<int   >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_gt<float >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_gt<double>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+
+    template void compare_ge<uchar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ge<schar >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ge<ushort>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ge<short >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ge<int   >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ge<float >(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template void compare_ge<double>(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);


    //////////////////////////////////////////////////////////////////////////
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@ -64,6 +64,7 @@ void cv::gpu::sqrt(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::compare(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
+void cv::gpu::compare(const GpuMat&, Scalar, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_not(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_or(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::bitwise_or(const GpuMat&, const Scalar&, GpuMat&, Stream&) { throw_nogpu(); }
@ -1357,17 +1358,24 @@ void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream)

 namespace cv { namespace gpu { namespace device
 {
-    template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
-    template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    template <typename T> void compare_eq(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_ne(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_lt(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_le(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
+
+    template <typename T> void compare_eq(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_ne(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_lt(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_le(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_gt(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    template <typename T> void compare_ge(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
 }}}

 void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
 {
    using namespace cv::gpu::device;

-    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
+    typedef void (*func_t)(DevMem2Db src1, DevMem2Db src2, DevMem2Db dst, cudaStream_t stream);
    static const func_t funcs[7][4] =
    {
        {compare_eq<unsigned char> , compare_ne<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> },
@ -1407,6 +1415,57 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
    funcs[src1.depth()][codes[cmpop]](psrc1[cmpop]->reshape(1), psrc2[cmpop]->reshape(1), dst.reshape(1), StreamAccessor::getStream(stream));
 }

+namespace
+{
+    template <typename T>
+    void castScalar(Scalar& sc)
+    {
+        sc.val[0] = saturate_cast<T>(sc.val[0]);
+        sc.val[1] = saturate_cast<T>(sc.val[1]);
+        sc.val[2] = saturate_cast<T>(sc.val[2]);
+        sc.val[3] = saturate_cast<T>(sc.val[3]);
+    }
+}
+
+void cv::gpu::compare(const GpuMat& src, Scalar sc, GpuMat& dst, int cmpop, Stream& stream)
+{
+    using namespace cv::gpu::device;
+
+    typedef void (*func_t)(DevMem2Db src, int cn, double val[4], DevMem2Db dst, cudaStream_t stream);
+    static const func_t funcs[7][6] =
+    {
+        {compare_eq<unsigned char> , compare_gt<unsigned char> , compare_ge<unsigned char> , compare_lt<unsigned char> , compare_le<unsigned char> , compare_ne<unsigned char> },
+        {compare_eq<signed char>   , compare_gt<signed char>   , compare_ge<signed char>   , compare_lt<signed char>   , compare_le<signed char>   , compare_ne<signed char>   },
+        {compare_eq<unsigned short>, compare_gt<unsigned short>, compare_ge<unsigned short>, compare_lt<unsigned short>, compare_le<unsigned short>, compare_ne<unsigned short>},
+        {compare_eq<short>         , compare_gt<short>         , compare_ge<short>         , compare_lt<short>         , compare_le<short>         , compare_ne<short>         },
+        {compare_eq<int>           , compare_gt<int>           , compare_ge<int>           , compare_lt<int>           , compare_le<int>           , compare_ne<int>           },
+        {compare_eq<float>         , compare_gt<float>         , compare_ge<float>         , compare_lt<float>         , compare_le<float>         , compare_ne<float>         },
+        {compare_eq<double>        , compare_gt<double>        , compare_ge<double>        , compare_lt<double>        , compare_le<double>        , compare_ne<double>        }
+    };
+
+    typedef void (*cast_func_t)(Scalar& sc);
+    static const cast_func_t cast_func[] =
+    {
+        castScalar<unsigned char>, castScalar<signed char>, castScalar<unsigned short>, castScalar<short>, castScalar<int>, castScalar<float>, castScalar<double>
+    };
+
+    CV_Assert(src.depth() <= CV_64F);
+    CV_Assert(src.channels() <= 4);
+    CV_Assert(cmpop >= CMP_EQ && cmpop <= CMP_NE);
+
+    if (src.depth() == CV_64F)
+    {
+        if (!TargetArchs::builtWith(NATIVE_DOUBLE) || !DeviceInfo().supports(NATIVE_DOUBLE))
+            CV_Error(CV_StsUnsupportedFormat, "The device doesn't support double");
+    }
+
+    dst.create(src.size(), CV_MAKE_TYPE(CV_8U, src.channels()));
+
+    cast_func[src.depth()](sc);
+
+    funcs[src.depth()][cmpop](src, src.channels(), sc.val, dst, StreamAccessor::getStream(stream));
+}
+

 //////////////////////////////////////////////////////////////////////////////
 // Unary bitwise logical operations
--- a/modules/gpu/test/test_core.cpp
+++ b/modules/gpu/test/test_core.cpp
@ -1480,12 +1480,12 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
    WHOLE_SUBMAT));

 ////////////////////////////////////////////////////////////////////////////////
-// compare
+// Compare_Array

 CV_ENUM(CmpCode, cv::CMP_EQ, cv::CMP_GT, cv::CMP_GE, cv::CMP_LT, cv::CMP_LE, cv::CMP_NE)
 #define ALL_CMP_CODES testing::Values(CmpCode(cv::CMP_EQ), CmpCode(cv::CMP_NE), CmpCode(cv::CMP_GT), CmpCode(cv::CMP_GE), CmpCode(cv::CMP_LT), CmpCode(cv::CMP_LE))

-PARAM_TEST_CASE(Compare, cv::gpu::DeviceInfo, cv::Size, MatDepth, CmpCode, UseRoi)
+PARAM_TEST_CASE(Compare_Array, cv::gpu::DeviceInfo, cv::Size, MatDepth, CmpCode, UseRoi)
 {
    cv::gpu::DeviceInfo devInfo;
    cv::Size size;
@ -1505,7 +1505,7 @@ PARAM_TEST_CASE(Compare, cv::gpu::DeviceInfo, cv::Size, MatDepth, CmpCode, UseRo
    }
 };

-TEST_P(Compare, Accuracy)
+TEST_P(Compare_Array, Accuracy)
 {
    cv::Mat src1 = randomMat(size, depth);
    cv::Mat src2 = randomMat(size, depth);
@ -1534,13 +1534,116 @@ TEST_P(Compare, Accuracy)
    }
 }

-INSTANTIATE_TEST_CASE_P(GPU_Core, Compare, testing::Combine(
+INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Array, testing::Combine(
    ALL_DEVICES,
    DIFFERENT_SIZES,
    ALL_DEPTH,
    ALL_CMP_CODES,
    WHOLE_SUBMAT));

+////////////////////////////////////////////////////////////////////////////////
+// Compare_Scalar
+
+namespace
+{
+    template <template <typename> class Op, typename T>
+    void compareScalarImpl(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst)
+    {
+        Op<T> op;
+
+        const int cn = src.channels();
+
+        dst.create(src.size(), CV_MAKE_TYPE(CV_8U, cn));
+
+        for (int y = 0; y < src.rows; ++y)
+        {
+            for (int x = 0; x < src.cols; ++x)
+            {
+                for (int c = 0; c < cn; ++c)
+                {
+                    T src_val = src.at<T>(y, x * cn + c);
+                    T sc_val = cv::saturate_cast<T>(sc.val[c]);
+                    dst.at<uchar>(y, x * cn + c) = static_cast<uchar>(static_cast<int>(op(src_val, sc_val)) * 255);
+                }
+            }
+        }
+    }
+
+    void compareScalarGold(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst, int cmpop)
+    {
+        typedef void (*func_t)(const cv::Mat& src, cv::Scalar sc, cv::Mat& dst);
+        static const func_t funcs[7][6] =
+        {
+            {compareScalarImpl<std::equal_to, unsigned char> , compareScalarImpl<std::greater, unsigned char> , compareScalarImpl<std::greater_equal, unsigned char> , compareScalarImpl<std::less, unsigned char> , compareScalarImpl<std::less_equal, unsigned char> , compareScalarImpl<std::not_equal_to, unsigned char> },
+            {compareScalarImpl<std::equal_to, signed char>   , compareScalarImpl<std::greater, signed char>   , compareScalarImpl<std::greater_equal, signed char>   , compareScalarImpl<std::less, signed char>   , compareScalarImpl<std::less_equal, signed char>   , compareScalarImpl<std::not_equal_to, signed char>   },
+            {compareScalarImpl<std::equal_to, unsigned short>, compareScalarImpl<std::greater, unsigned short>, compareScalarImpl<std::greater_equal, unsigned short>, compareScalarImpl<std::less, unsigned short>, compareScalarImpl<std::less_equal, unsigned short>, compareScalarImpl<std::not_equal_to, unsigned short>},
+            {compareScalarImpl<std::equal_to, short>         , compareScalarImpl<std::greater, short>         , compareScalarImpl<std::greater_equal, short>         , compareScalarImpl<std::less, short>         , compareScalarImpl<std::less_equal, short>         , compareScalarImpl<std::not_equal_to, short>         },
+            {compareScalarImpl<std::equal_to, int>           , compareScalarImpl<std::greater, int>           , compareScalarImpl<std::greater_equal, int>           , compareScalarImpl<std::less, int>           , compareScalarImpl<std::less_equal, int>           , compareScalarImpl<std::not_equal_to, int>           },
+            {compareScalarImpl<std::equal_to, float>         , compareScalarImpl<std::greater, float>         , compareScalarImpl<std::greater_equal, float>         , compareScalarImpl<std::less, float>         , compareScalarImpl<std::less_equal, float>         , compareScalarImpl<std::not_equal_to, float>         },
+            {compareScalarImpl<std::equal_to, double>        , compareScalarImpl<std::greater, double>        , compareScalarImpl<std::greater_equal, double>        , compareScalarImpl<std::less, double>        , compareScalarImpl<std::less_equal, double>        , compareScalarImpl<std::not_equal_to, double>        }
+        };
+
+        funcs[src.depth()][cmpop](src, sc, dst);
+    }
+}
+
+PARAM_TEST_CASE(Compare_Scalar, cv::gpu::DeviceInfo, cv::Size, MatType, CmpCode, UseRoi)
+{
+    cv::gpu::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int cmp_code;
+    bool useRoi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        cmp_code = GET_PARAM(3);
+        useRoi = GET_PARAM(4);
+
+        cv::gpu::setDevice(devInfo.deviceID());
+    }
+};
+
+TEST_P(Compare_Scalar, Accuracy)
+{
+    cv::Mat src = randomMat(size, type);
+    cv::Scalar sc = randomScalar(0.0, 255.0);
+
+    if (src.depth() == CV_64F && !supportFeature(devInfo, cv::gpu::NATIVE_DOUBLE))
+    {
+        try
+        {
+            cv::gpu::GpuMat dst;
+            cv::gpu::compare(loadMat(src), sc, dst, cmp_code);
+        }
+        catch (const cv::Exception& e)
+        {
+            ASSERT_EQ(CV_StsUnsupportedFormat, e.code);
+        }
+    }
+    else
+    {
+        cv::gpu::GpuMat dst = createMat(size, CV_MAKE_TYPE(CV_8U, src.channels()), useRoi);
+
+        cv::gpu::compare(loadMat(src, useRoi), sc, dst, cmp_code);
+
+        cv::Mat dst_gold;
+        compareScalarGold(src, sc, dst_gold, cmp_code);
+
+        EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
+    ALL_DEVICES,
+    DIFFERENT_SIZES,
+    TYPES(CV_8U, CV_64F, 1, 4),
+    ALL_CMP_CODES,
+    WHOLE_SUBMAT));
+
 //////////////////////////////////////////////////////////////////////////////
 // Bitwise_Array