diff --git a/modules/gpu/doc/per_element_operations.rst b/modules/gpu/doc/per_element_operations.rst
index a59875e64..2670ba323 100644
--- a/modules/gpu/doc/per_element_operations.rst
+++ b/modules/gpu/doc/per_element_operations.rst
@@ -276,6 +276,8 @@ Compares elements of two matrices.
 
 .. ocv:function:: void gpu::compare( const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream=Stream::Null() )
 
+.. ocv:function:: void gpu::compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null())
+
     :param a: First source matrix.
 
     :param b: Second source matrix with the same size and type as ``a`` .
diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 7493d83b5..4eb339e7e 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -533,6 +533,7 @@ CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream
 
 //! compares elements of two arrays (c = a <cmpop> b)
 CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
+CV_EXPORTS void compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
 
 //! performs per-elements bit-wise inversion
 CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
diff --git a/modules/gpu/perf/perf_core.cpp b/modules/gpu/perf/perf_core.cpp
index cfd572dc1..c78bfd6e2 100644
--- a/modules/gpu/perf/perf_core.cpp
+++ b/modules/gpu/perf/perf_core.cpp
@@ -647,6 +647,39 @@ PERF_TEST_P(Sz_Depth_Code, Core_CompareMat, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_DEPTH, ALL_CMP_CODES))
     }
 }
 
+//////////////////////////////////////////////////////////////////////
+// CompareScalar
+
+PERF_TEST_P(Sz_Depth_Code, Core_CompareScalar, Combine(GPU_TYPICAL_MAT_SIZES, ARITHM_MAT_DEPTH, ALL_CMP_CODES))
+{
+    const cv::Size size = GET_PARAM(0);
+    const int depth = GET_PARAM(1);
+    const int cmp_code = GET_PARAM(2);
+
+    cv::Mat src(size, depth);
+    fillRandom(src);
+
+    cv::Scalar s = cv::Scalar::all(100);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::gpu::GpuMat d_src(src);
+        cv::gpu::GpuMat d_dst;
+
+        TEST_CYCLE() cv::gpu::compare(d_src, s, d_dst, cmp_code);
+
+        GPU_SANITY_CHECK(d_dst);
+    }
+    else
+    {
+        cv::Mat dst;
+
+        TEST_CYCLE() cv::compare(src, s, dst, cmp_code);
+
+        CPU_SANITY_CHECK(dst);
+    }
+}
+
 //////////////////////////////////////////////////////////////////////
 // BitwiseNot
 
diff --git a/modules/gpu/src/cuda/element_operations.cu b/modules/gpu/src/cuda/element_operations.cu
index 4b52cc7dd..27fb61ff7 100644
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
@@ -1954,6 +1954,226 @@ namespace arithm
     template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
 }
 
+//////////////////////////////////////////////////////////////////////////////////////
+// cmpScalar
+
+namespace arithm
+{
+#define TYPE_VEC(type, cn) typename TypeVec<type, cn>::vec_type
+
+    template <class Cmp, typename T, int cn> struct CmpScalar;
+    template <class Cmp, typename T>
+    struct CmpScalar<Cmp, T, 1> : unary_function<T, uchar>
+    {
+        const T val;
+
+        __host__ explicit CmpScalar(T val_) : val(val_) {}
+
+        __device__ __forceinline__ uchar operator()(T src) const
+        {
+            Cmp op;
+            return op(src, val);
+        }
+    };
+    template <class Cmp, typename T>
+    struct CmpScalar<Cmp, T, 2> : unary_function<TYPE_VEC(T, 2), TYPE_VEC(uchar, 2)>
+    {
+        const TYPE_VEC(T, 2) val;
+
+        __host__ explicit CmpScalar(TYPE_VEC(T, 2) val_) : val(val_) {}
+
+        __device__ __forceinline__ TYPE_VEC(uchar, 2) operator()(const TYPE_VEC(T, 2) & src) const
+        {
+            Cmp op;
+            return VecTraits<TYPE_VEC(uchar, 2)>::make(op(src.x, val.x), op(src.y, val.y));
+        }
+    };
+    template <class Cmp, typename T>
+    struct CmpScalar<Cmp, T, 3> : unary_function<TYPE_VEC(T, 3), TYPE_VEC(uchar, 3)>
+    {
+        const TYPE_VEC(T, 3) val;
+
+        __host__ explicit CmpScalar(TYPE_VEC(T, 3) val_) : val(val_) {}
+
+        __device__ __forceinline__ TYPE_VEC(uchar, 3) operator()(const TYPE_VEC(T, 3) & src) const
+        {
+            Cmp op;
+            return VecTraits<TYPE_VEC(uchar, 3)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z));
+        }
+    };
+    template <class Cmp, typename T>
+    struct CmpScalar<Cmp, T, 4> : unary_function<TYPE_VEC(T, 4), TYPE_VEC(uchar, 4)>
+    {
+        const TYPE_VEC(T, 4) val;
+
+        __host__ explicit CmpScalar(TYPE_VEC(T, 4) val_) : val(val_) {}
+
+        __device__ __forceinline__ TYPE_VEC(uchar, 4) operator()(const TYPE_VEC(T, 4) & src) const
+        {
+            Cmp op;
+            return VecTraits<TYPE_VEC(uchar, 4)>::make(op(src.x, val.x), op(src.y, val.y), op(src.z, val.z), op(src.w, val.w));
+        }
+    };
+
+#undef TYPE_VEC
+}
+
+namespace cv { namespace gpu { namespace device
+{
+    template <class Cmp, typename T> struct TransformFunctorTraits< arithm::CmpScalar<Cmp, T, 1> > : arithm::ArithmFuncTraits<sizeof(T), sizeof(uchar)>
+    {
+    };
+}}}
+
+namespace arithm
+{
+    template