updated patch to bring in the first functions with "transparent API"

2013-11-18 11:48:00 -05:00
parent bb4bf7a1f9
commit d914f20a4c
64 changed files with 13355 additions and 318 deletions
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@@ -347,6 +347,10 @@ CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst);
 CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst);
 //! computes per-element maximum of two arrays (dst = max(src1, src2))
 CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst);
+//! computes per-element minimum of two arrays (dst = min(src1, src2))
+CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst);
+//! computes per-element maximum of two arrays (dst = max(src1, src2))
+CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst);

 //! computes square root of each matrix element (dst = src**0.5)
 CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst);
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -58,6 +58,8 @@ namespace cv
 enum { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,
    ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 };

+class CV_EXPORTS _OutputArray;
+
 //////////////////////// Input/Output Array Arguments /////////////////////////////////

 /*!
@@ -116,12 +118,22 @@ public:
    void* getObj() const;

    virtual int kind() const;
+    virtual int dims(int i=-1) const;
    virtual Size size(int i=-1) const;
+    virtual int sizend(int* sz, int i=-1) const;
+    virtual bool sameSize(const _InputArray& arr) const;
    virtual size_t total(int i=-1) const;
    virtual int type(int i=-1) const;
    virtual int depth(int i=-1) const;
    virtual int channels(int i=-1) const;
+    virtual bool isContinuous(int i=-1) const;
    virtual bool empty() const;
+    virtual void copyTo(const _OutputArray& arr) const;
+    bool isMat() const;
+    bool isUMat() const;
+    bool isMatVectot() const;
+    bool isUMatVector() const;
+    bool isMatx() const; // true when kind() == _InputArray::MATX; const like the other is*() predicates

    virtual ~_InputArray();

@@ -197,8 +209,10 @@ public:
    virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
+    virtual void createSameSize(const _InputArray& arr, int mtype) const;
    virtual void release() const;
    virtual void clear() const;
+    virtual void setTo(const _InputArray& value) const;
 };


--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -108,6 +108,12 @@ inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem)

 inline _InputArray::~_InputArray() {}

+inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; }
+inline bool _InputArray::isUMat() const  { return kind() == _InputArray::UMAT; }
+inline bool _InputArray::isMatVectot() const { return kind() == _InputArray::STD_VECTOR_MAT; } // NOTE(review): "Vectot" looks like a typo for "Vector"; kept as-is because renaming changes the public API
+inline bool _InputArray::isUMatVector() const  { return kind() == _InputArray::STD_VECTOR_UMAT; }
+inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; }
+
 ////////////////////////////////////////////////////////////////////////////////////////

 inline _OutputArray::_OutputArray() { init(ACCESS_WRITE, 0); }
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@@ -49,13 +49,13 @@ namespace cv { namespace ocl {
 CV_EXPORTS bool haveOpenCL();
 CV_EXPORTS bool useOpenCL();
 CV_EXPORTS void setUseOpenCL(bool flag);
-CV_EXPORTS void finish();
+CV_EXPORTS void finish2();

-class CV_EXPORTS Context;
+class CV_EXPORTS Context2;
 class CV_EXPORTS Device;
 class CV_EXPORTS Kernel;
 class CV_EXPORTS Program;
-class CV_EXPORTS ProgramSource;
+class CV_EXPORTS ProgramSource2;
 class CV_EXPORTS Queue;

 class CV_EXPORTS Device
@@ -199,22 +199,22 @@ protected:
 };


-class CV_EXPORTS Context
+class CV_EXPORTS Context2
 {
 public:
-    Context();
-    explicit Context(int dtype);
-    ~Context();
-    Context(const Context& c);
-    Context& operator = (const Context& c);
+    Context2();
+    explicit Context2(int dtype);
+    ~Context2();
+    Context2(const Context2& c);
+    Context2& operator = (const Context2& c);

    bool create(int dtype);
    size_t ndevices() const;
    const Device& device(size_t idx) const;
-    Program getProg(const ProgramSource& prog,
+    Program getProg(const ProgramSource2& prog,
                    const String& buildopt, String& errmsg);

-    static Context& getDefault();
+    static Context2& getDefault();
    void* ptr() const;
 protected:
    struct Impl;
@@ -226,12 +226,12 @@ class CV_EXPORTS Queue
 {
 public:
    Queue();
-    explicit Queue(const Context& c, const Device& d=Device());
+    explicit Queue(const Context2& c, const Device& d=Device());
    ~Queue();
    Queue(const Queue& q);
    Queue& operator = (const Queue& q);

-    bool create(const Context& c=Context(), const Device& d=Device());
+    bool create(const Context2& c=Context2(), const Device& d=Device());
    void finish();
    void* ptr() const;
    static Queue& getDefault();
@@ -245,41 +245,55 @@ protected:
 class CV_EXPORTS KernelArg
 {
 public:
-    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8 };
-    KernelArg(int _flags, UMat* _m, void* _obj=0, size_t _sz=0);
+    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, NO_SIZE=256 };
+    KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0);
+    KernelArg();

    static KernelArg Local() { return KernelArg(LOCAL, 0); }
-    static KernelArg ReadOnly(const UMat& m) { return KernelArg(READ_ONLY, (UMat*)&m); }
-    static KernelArg WriteOnly(const UMat& m) { return KernelArg(WRITE_ONLY, (UMat*)&m); }
+    static KernelArg ReadWrite(const UMat& m, int wscale=1)
+    { return KernelArg(READ_WRITE, (UMat*)&m, wscale); }
+    static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); }
+    static KernelArg ReadOnly(const UMat& m, int wscale=1)
+    { return KernelArg(READ_ONLY, (UMat*)&m, wscale); }
+    static KernelArg WriteOnly(const UMat& m, int wscale=1)
+    { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); }
+    static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); }
+    static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); }
    static KernelArg Constant(const Mat& m);
    template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
-    { return KernelArg(CONSTANT, 0, (void*)arr, n); }
+    { return KernelArg(CONSTANT, 0, 1, (void*)arr, n); }

    int flags;
    UMat* m;
-    void* obj;
+    const void* obj;
    size_t sz;
+    int wscale;
 };

+
 class CV_EXPORTS Kernel
 {
 public:
    Kernel();
    Kernel(const char* kname, const Program& prog);
-    Kernel(const char* kname, const ProgramSource& prog,
-           const String& buildopts, String& errmsg);
+    Kernel(const char* kname, const ProgramSource2& prog,
+           const String& buildopts, String* errmsg=0);
    ~Kernel();
    Kernel(const Kernel& k);
    Kernel& operator = (const Kernel& k);

+    bool empty() const;
    bool create(const char* kname, const Program& prog);
-    bool create(const char* kname, const ProgramSource& prog,
-                const String& buildopts, String& errmsg);
+    bool create(const char* kname, const ProgramSource2& prog,
+                const String& buildopts, String* errmsg=0);

-    void set(int i, const void* value, size_t sz);
-    void set(int i, const UMat& m);
-    void set(int i, const KernelArg& arg);
-    template<typename _Tp> void set(int i, const _Tp& value)
+    int set(int i, const void* value, size_t sz);
+    int set(int i, const UMat& m);
+    int set(int i, const KernelArg& arg);
+    template<typename _Tp> int set(int i, const _Tp& value)
    { return set(i, &value, sizeof(value)); }

    template<typename _Tp0>
@@ -291,26 +305,27 @@ public:
    template<typename _Tp0, typename _Tp1>
    Kernel& args(const _Tp0& a0, const _Tp1& a1)
    {
-        set(0, a0); set(1, a1); return *this;
+        int i = set(0, a0); set(i, a1); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2)
    {
-        set(0, a0); set(1, a1); set(2, a2); return *this;
+        int i = set(0, a0); i = set(i, a1); set(i, a2); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
                 const _Tp3& a3, const _Tp4& a4)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2);
+        i = set(i, a3); set(i, a4); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2,
@@ -318,8 +333,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
                 const _Tp3& a3, const _Tp4& a4, const _Tp5& a5)
    {
-        set(0, a0); set(1, a1); set(2, a2);
-        set(3, a3); set(4, a4); set(5, a5); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2);
+        i = set(i, a3); i = set(i, a4); set(i, a5); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -327,8 +342,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
-        set(4, a4); set(5, a5); set(6, a6); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
+        i = set(i, a4); i = set(i, a5); set(i, a6); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -336,8 +351,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
-        set(4, a4); set(5, a5); set(6, a6); set(7, a7); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
+        i = set(i, a4); i = set(i, a5); i = set(i, a6); set(i, a7); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@@ -346,8 +361,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4);
-        set(5, a5); set(6, a6); set(7, a7); set(8, a8); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@@ -356,8 +371,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); set(i, a9); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -367,8 +382,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); set(i, a10); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@@ -378,13 +393,13 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10, const _Tp11& a11)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); set(11, a11); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); set(i, a11); return *this;
    }

-    void run(int dims, size_t offset[], size_t globalsize[],
+    bool run(int dims, size_t globalsize[],
             size_t localsize[], bool sync, const Queue& q=Queue());
-    void runTask(bool sync, const Queue& q=Queue());
+    bool runTask(bool sync, const Queue& q=Queue());

    size_t workGroupSize() const;
    bool compileWorkGroupSize(size_t wsz[]) const;
@@ -401,7 +416,7 @@ class CV_EXPORTS Program
 {
 public:
    Program();
-    Program(const ProgramSource& src,
+    Program(const ProgramSource2& src,
            const String& buildflags, String& errmsg);
    explicit Program(const String& buf);
    Program(const Program& prog);
@@ -409,12 +424,12 @@ public:
    Program& operator = (const Program& prog);
    ~Program();

-    bool create(const ProgramSource& src,
+    bool create(const ProgramSource2& src,
                const String& buildflags, String& errmsg);
    bool read(const String& buf, const String& buildflags);
    bool write(String& buf) const;

-    const ProgramSource& source() const;
+    const ProgramSource2& source() const;
    void* ptr() const;

    String getPrefix() const;
@@ -426,17 +441,17 @@ protected:
 };


-class CV_EXPORTS ProgramSource
+class CV_EXPORTS ProgramSource2
 {
 public:
    typedef uint64 hash_t;

-    ProgramSource();
-    explicit ProgramSource(const String& prog);
-    explicit ProgramSource(const char* prog);
-    ~ProgramSource();
-    ProgramSource(const ProgramSource& prog);
-    ProgramSource& operator = (const ProgramSource& prog);
+    ProgramSource2();
+    explicit ProgramSource2(const String& prog);
+    explicit ProgramSource2(const char* prog);
+    ~ProgramSource2();
+    ProgramSource2(const ProgramSource2& prog);
+    ProgramSource2& operator = (const ProgramSource2& prog);

    const String& source() const;
    hash_t hash() const;
@@ -446,6 +461,10 @@ protected:
    Impl* p;
 };

+CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
+CV_EXPORTS const char* typeToStr(int t);
+CV_EXPORTS const char* memopTypeToStr(int t);
+
 }}

 #endif
--- a/modules/core/include/opencv2/core/ocl_genbase.hpp
+++ b/modules/core/include/opencv2/core/ocl_genbase.hpp
@@ -0,0 +1,60 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_OPENCL_GENBASE_HPP__
+#define __OPENCV_OPENCL_GENBASE_HPP__
+
+namespace cv
+{
+namespace ocl
+{
+
+struct ProgramEntry
+{
+    const char* name;
+    const char* programStr;
+    const char* programHash;
+};
+
+}
+}
+
+#endif
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@@ -911,33 +911,112 @@ void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t bl
        scbuf[i] = scbuf[i - esz];
 }

-static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
-               InputArray _mask, const BinaryFunc* tab, bool bitwise)
+// Operation codes for the OpenCL arithmetic helpers; each value is used as an index into oclop2str[].
+enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
+       OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
+       OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14 };
+// Macro names passed as "-D <name>" in the kernel build options, listed in OCL_OP_* order and 0-terminated.
+static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
+    "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
+    "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", 0 };
+// OpenCL path of binary_op(): false if the op is unsupported here or the "KF" kernel is empty, else the result of Kernel::run().
+static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
+                          InputArray _mask, bool bitwise, int oclop, bool haveScalar )
 {
-    int kind1 = _src1.kind(), kind2 = _src2.kind();
-    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
+    bool haveMask = !_mask.empty();
+    int srctype = _src1.type();
+    int srcdepth = CV_MAT_DEPTH(srctype);
+    int cn = CV_MAT_CN(srctype);
+    // Reject op codes that do not index a name in oclop2str, and masked/scalar mode with more than 4 channels.
+    if( oclop < 0 || oclop > OCL_OP_MAX || ((haveMask || haveScalar) && cn > 4) )
+        return false;
+
+    UMat src1 = _src1.getUMat(), src2;
+    UMat dst = _dst.getUMat(), mask = _mask.getUMat();
+
+    char opts[1024];
+    int kercn = haveMask || haveScalar ? cn : 1;
+    sprintf(opts, "-D %s%s -D %s -D dstT=%s",
+            (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop],
+            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
+            ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)));
+
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
+    if( k.empty() )
+        return false;
+    // cscale (cn/kercn) is handed to KernelArg as wscale and also scales the global work width below.
+    int cscale = cn/kercn;
+    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
+    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
+                                       ocl::KernelArg::WriteOnly(dst, cscale);
+    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
+
+    if( haveScalar )
+    {
+        size_t esz = CV_ELEM_SIZE(srctype);
+        double buf[4] = {0,0,0,0};
+        // For OCL_OP_NOT the scalar buffer is left zero-filled; otherwise _src2 is converted to srctype into it.
+        if( oclop != OCL_OP_NOT )
+        {
+            Mat src2sc = _src2.getMat();
+            convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
+        }
+
+        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
+
+        if( !haveMask )
+            k.args(src1arg, dstarg, scalararg);
+        else
+            k.args(src1arg, maskarg, dstarg, scalararg);
+    }
+    else
+    {
+        src2 = _src2.getUMat();
+        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
+
+        if( !haveMask )
+            k.args(src1arg, src2arg, dstarg);
+        else
+            k.args(src1arg, src2arg, maskarg, dstarg);
+    }
+    // Cast before multiplying: the width is computed in size_t and the braces hold no int -> size_t narrowing.
+    size_t globalsize[] = { (size_t)src1.cols*cscale, (size_t)src1.rows };
+    return k.run(2, globalsize, 0, false);
+}
+
+
+static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
+                       InputArray _mask, const BinaryFunc* tab,
+                       bool bitwise, int oclop )
+{
+    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
+    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
+    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
+    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
+    int dims1 = psrc1->dims(), dims2 = psrc2->dims();
+    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
+    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
+    bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
+                        ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
    bool haveMask = !_mask.empty(), haveScalar = false;
    BinaryFunc func;
-    int c;

-    if( src1.dims <= 2 && src2.dims <= 2 && kind1 == kind2 &&
-        src1.size() == src2.size() && src1.type() == src2.type() && !haveMask )
+    if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
    {
-        _dst.create(src1.size(), src1.type());
-        Mat dst = _dst.getMat();
+        _dst.create(sz1, type1);
+        if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false) )
+            return;
        if( bitwise )
        {
            func = *tab;
-            c = (int)src1.elemSize();
+            cn = (int)CV_ELEM_SIZE(type1);
        }
        else
-        {
-            func = tab[src1.depth()];
-            c = src1.channels();
-        }
+            func = tab[depth1];

+        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst);
-        size_t len = sz.width*(size_t)c;
+        size_t len = sz.width*(size_t)cn;
        if( len == (size_t)(int)len )
        {
            sz.width = (int)len;
@@ -946,56 +1025,67 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
        }
    }

-    if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
-        src1.size != src2.size || src1.type() != src2.type() )
+    if( oclop == OCL_OP_NOT )
+        haveScalar = true;
+    else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
+        !psrc1->sameSize(*psrc2) || type1 != type2 )
    {
-        if( checkScalar(src1, src2.type(), kind1, kind2) )
+        if( checkScalar(*psrc1, type2, kind1, kind2) )
+        {
            // src1 is a scalar; swap it with src2
-            swap(src1, src2);
-        else if( !checkScalar(src2, src1.type(), kind2, kind1) )
+            swap(psrc1, psrc2);
+            swap(type1, type2);
+            swap(depth1, depth2);
+            swap(cn, cn2);
+            swap(sz1, sz2);
+        }
+        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The operation is neither 'array op array' (where arrays have the same size and type), "
                      "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
    }
+    else
+    {
+        CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
+    }

-    size_t esz = src1.elemSize();
+    size_t esz = CV_ELEM_SIZE(type1);
    size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
-    int cn = src1.channels();
    BinaryFunc copymask = 0;
-    Mat mask;
    bool reallocate = false;

    if( haveMask )
    {
-        mask = _mask.getMat();
-        CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) );
-        CV_Assert( mask.size == src1.size );
+        int mtype = _mask.type();
+        CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
        copymask = getCopyMaskFunc(esz);
-        Mat tdst = _dst.getMat();
-        reallocate = tdst.size != src1.size || tdst.type() != src1.type();
+        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
    }

    AutoBuffer<uchar> _buf;
    uchar *scbuf = 0, *maskbuf = 0;

-    _dst.create(src1.dims, src1.size, src1.type());
-    Mat dst = _dst.getMat();
-
+    _dst.createSameSize(*psrc1, type1);
    // if this is mask operation and dst has been reallocated,
-    // we have to
+    // we have to clear the destination
    if( haveMask && reallocate )
-        dst = Scalar::all(0);
+        _dst.setTo(0.);
+
+    if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar ))
+        return;
+
+    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
+    Mat dst = _dst.getMat(), mask = _mask.getMat();

    if( bitwise )
    {
        func = *tab;
-        c = (int)esz;
+        cn = (int)esz;
    }
    else
    {
-        func = tab[src1.depth()];
-        c = cn;
+        func = tab[depth1];
    }

    if( !haveScalar )
@@ -1006,8 +1096,8 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;

-        if( blocksize*c > INT_MAX )
-            blocksize = INT_MAX/c;
+        if( blocksize*cn > INT_MAX )
+            blocksize = INT_MAX/cn;

        if( haveMask )
        {
@@ -1022,7 +1112,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
            {
                int bsz = (int)MIN(total - j, blocksize);

-                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 );
+                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
@@ -1054,7 +1144,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
            {
                int bsz = (int)MIN(total - j, blocksize);

-                func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*c, 1), 0 );
+                func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
@@ -1101,47 +1191,59 @@ static BinaryFunc* getMinTab()
 void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
-    binary_op(a, b, c, mask, &f, true);
+    binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
 }

 void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
-    binary_op(a, b, c, mask, &f, true);
+    binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
 }

 void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
-    binary_op(a, b, c, mask, &f, true);
+    binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
 }

 void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
-    binary_op(a, a, c, mask, &f, true);
+    binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
 }

 void cv::max( InputArray src1, InputArray src2, OutputArray dst )
 {
-    binary_op(src1, src2, dst, noArray(), getMaxTab(), false );
+    binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
 }

 void cv::min( InputArray src1, InputArray src2, OutputArray dst )
 {
-    binary_op(src1, src2, dst, noArray(), getMinTab(), false );
+    binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
 }

 void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
 {
    OutputArray _dst(dst);
-    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false );
+    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
 }

 void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
 {
    OutputArray _dst(dst);
-    binary_op(src1, src2, _dst, noArray(), getMinTab(), false );
+    binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
+}
+
+void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
+{
+    OutputArray _dst(dst);
+    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
+}
+
+void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
+{
+    OutputArray _dst(dst);
+    binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
 }


@@ -1171,73 +1273,213 @@ static int actualScalarDepth(const double* data, int len)
        CV_32S;
 }

-static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
-               InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, void* usrdata=0)
+// Runs the element-wise operation 'oclop' on the OpenCL device via kernel "KF"; returns false when the caller should fall back to the CPU code.
+static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
+                          InputArray _mask, int wtype,
+                          void* usrdata, int oclop,
+                          bool haveScalar )
 {
-    int kind1 = _src1.kind(), kind2 = _src2.kind();
-    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
+    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
+    bool haveMask = !_mask.empty();
+    // masked and scalar variants are compiled with kercn == cn (see below); more than 4 channels is not handled
+    if( (haveMask || haveScalar) && cn > 4 )
+        return false;
+
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = CV_MAT_DEPTH(wtype);
+    wtype = CV_MAKETYPE(wdepth, cn);
+    int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2); // a scalar is converted to wtype below; an array operand keeps its own type
+
+    UMat src1 = _src1.getUMat(), src2;
+    UMat dst = _dst.getUMat(), mask = _mask.getUMat();
+
+    char opts[1024];
+    int kercn = haveMask || haveScalar ? cn : 1; // channels per kernel element; 1 means the row is processed as cols*cn single-channel values
+
+    if( (depth1 == depth2 || haveScalar) && ddepth == depth1 && wdepth == depth1 )
+    {
+        const char* oclopstr = oclop2str[oclop];
+        if( wdepth <= CV_16S ) // depths up to CV_16S select the *_SAT variants of add/sub
+        {
+            oclopstr = oclop == OCL_OP_ADD ? "OCL_OP_ADD_SAT" :
+                       oclop == OCL_OP_SUB ? "OCL_OP_SUB_SAT" :
+                       oclop == OCL_OP_RSUB ? "OCL_OP_RSUB_SAT" : oclopstr;
+        }
+        sprintf(opts, "-D %s%s -D %s -D dstT=%s",
+                (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
+                oclopstr, ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)));
+    }
+    else
+    {
+        char cvtstr[3][32];
+        sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s "
+                "-D dstT=%s -D workT=%s -D convertToWT1=%s "
+                "-D convertToWT2=%s -D convertToDT=%s",
+                (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
+                oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
+                ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
+                ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
+                ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
+                ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
+                ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
+                ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]));
+    }
+    // usrdata holds n doubles (scale factor, or alpha/beta/gamma) that become extra kernel arguments
+    const uchar* usrdata_p = (const uchar*)usrdata;
+    const double* usrdata_d = (const double*)usrdata;
+    float usrdata_f[3];
+    int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
+        oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
+    if( n > 0 && wdepth == CV_32F ) // float working type: repack the doubles as floats
+    {
+        for( i = 0; i < n; i++ )
+            usrdata_f[i] = (float)usrdata_d[i];
+        usrdata_p = (const uchar*)usrdata_f;
+    }
+    size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
+
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
+    if( k.empty() )
+        return false;
+
+    int cscale = cn/kercn; // column multiplier: cn when kercn == 1, otherwise 1
+
+    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
+    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
+                                       ocl::KernelArg::WriteOnly(dst, cscale);
+    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
+
+    if( haveScalar )
+    {
+        size_t esz = CV_ELEM_SIZE(wtype);
+        double buf[4]={0,0,0,0};
+        Mat src2sc = _src2.getMat();
+
+        if( !src2sc.empty() )
+        {
+            convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
+        }
+        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz); // raw bytes of the converted scalar
+
+        if( !haveMask )
+            k.args(src1arg, dstarg, scalararg);
+        else
+            k.args(src1arg, maskarg, dstarg, scalararg);
+    }
+    else
+    {
+        src2 = _src2.getUMat();
+        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
+
+        if( !haveMask )
+        {
+            if(n == 0)
+                k.args(src1arg, src2arg, dstarg);
+            else if(n == 1)
+                k.args(src1arg, src2arg, dstarg,
+                       ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz));
+            else if(n == 3)
+                k.args(src1arg, src2arg, dstarg,
+                       ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz),
+                       ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
+                       ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
+            else
+                CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
+        }
+        else
+        {
+            k.args(src1arg, src2arg, maskarg, dstarg);
+        }
+    }
+
+    size_t globalsize[] = { (size_t)src1.cols*cscale, (size_t)src1.rows }; // one work item per kernel element
+    return k.run(2, globalsize, 0, false);
+}
+
+
+static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
+                      InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
+                      void* usrdata=0, int oclop=-1 )
+{
+    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
+    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
    bool haveMask = !_mask.empty();
    bool reallocate = false;
+    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
+    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
+    int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
+    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
+    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
+    bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
+                        ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
+    bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
+    bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);

-    bool src1Scalar = checkScalar(src1, src2.type(), kind1, kind2);
-    bool src2Scalar = checkScalar(src2, src1.type(), kind2, kind1);
-
-    if( (kind1 == kind2 || src1.channels() == 1) && src1.dims <= 2 && src2.dims <= 2 &&
-        src1.size() == src2.size() && src1.type() == src2.type() &&
-        !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == src1.depth())) ||
-                       (_dst.fixedType() && _dst.type() == _src1.type())) &&
+    if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
+        !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
+                       (_dst.fixedType() && _dst.type() == type1)) &&
        ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
    {
-        _dst.create(src1.size(), src1.type());
-        Mat dst = _dst.getMat();
+        _dst.createSameSize(*psrc1, type1);
+        if( use_opencl &&
+            ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
+                          (!usrdata ? type1 : std::max(depth1, CV_32F)),
+                          usrdata, oclop, false))
+            return;
+        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst, src1.channels());
-        tab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
+        tab[depth1](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
        return;
    }

    bool haveScalar = false, swapped12 = false;
-    int depth2 = src2.depth();
-    if( src1.size != src2.size || src1.channels() != src2.channels() ||
+
+    if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
        ((kind1 == _InputArray::MATX || kind2 == _InputArray::MATX) &&
-         src1.cols == 1 && src2.rows == 4) )
+         (sz1 == Size(1,4) || sz2 == Size(1,4))) )
    {
-        if( checkScalar(src1, src2.type(), kind1, kind2) )
+        if( checkScalar(*psrc1, type2, kind1, kind2) )
        {
            // src1 is a scalar; swap it with src2
-            swap(src1, src2);
+            swap(psrc1, psrc2);
+            swap(sz1, sz2);
+            swap(type1, type2);
+            swap(depth1, depth2);
+            swap(cn, cn2);
+            swap(dims1, dims2);
            swapped12 = true;
+            if( oclop == OCL_OP_SUB )
+                oclop = OCL_OP_RSUB;
        }
-        else if( !checkScalar(src2, src1.type(), kind2, kind1) )
+        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
-                     "The operation is neither 'array op array' (where arrays have the same size and the same number of channels), "
+                     "The operation is neither 'array op array' "
+                     "(where arrays have the same size and the same number of channels), "
                     "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
-        CV_Assert(src2.type() == CV_64F && (src2.rows == 4 || src2.rows == 1));
+        CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));

        if (!muldiv)
        {
-            depth2 = actualScalarDepth(src2.ptr<double>(), src1.channels());
-            if( depth2 == CV_64F && (src1.depth() < CV_32S || src1.depth() == CV_32F) )
+            Mat sc = psrc2->getMat();
+            depth2 = actualScalarDepth(sc.ptr<double>(), cn);
+            if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
                depth2 = CV_32F;
        }
        else
            depth2 = CV_64F;
    }

-    int cn = src1.channels(), depth1 = src1.depth(), wtype;
-    BinaryFunc cvtsrc1 = 0, cvtsrc2 = 0, cvtdst = 0;
-
    if( dtype < 0 )
    {
        if( _dst.fixedType() )
            dtype = _dst.type();
        else
        {
-            if( !haveScalar && src1.type() != src2.type() )
+            if( !haveScalar && type1 != type2 )
                CV_Error(CV_StsBadArg,
                     "When the input arrays in add/subtract/multiply/divide functions have different types, "
                     "the output array type must be explicitly specified");
-            dtype = src1.type();
+            dtype = type1;
        }
    }
    dtype = CV_MAT_DEPTH(dtype);
@@ -1262,39 +1504,41 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
        wtype = std::max(wtype, dtype);
    }

-    cvtsrc1 = depth1 == wtype ? 0 : getConvertFunc(depth1, wtype);
-    cvtsrc2 = depth2 == depth1 ? cvtsrc1 : depth2 == wtype ? 0 : getConvertFunc(depth2, wtype);
-    cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
-
    dtype = CV_MAKETYPE(dtype, cn);
    wtype = CV_MAKETYPE(wtype, cn);

-    size_t esz1 = src1.elemSize(), esz2 = src2.elemSize();
-    size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
-    size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
-    BinaryFunc copymask = 0;
-    Mat mask;
-
    if( haveMask )
    {
-        mask = _mask.getMat();
-        CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) );
-        CV_Assert( mask.size == src1.size );
-        copymask = getCopyMaskFunc(dsz);
-        Mat tdst = _dst.getMat();
-        reallocate = tdst.size != src1.size || tdst.type() != dtype;
+        int mtype = _mask.type();
+        CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
+        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
    }

+    _dst.createSameSize(*psrc1, dtype);
+    if( reallocate )
+        _dst.setTo(0.);
+
+    if( use_opencl &&
+        ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
+                      usrdata, oclop, haveScalar))
+        return;
+
+    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
+    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
+    BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
+
+    size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
+    size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
+    size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
+    BinaryFunc copymask = getCopyMaskFunc(dsz);
+    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();
+
    AutoBuffer<uchar> _buf;
    uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
-    size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0);
-
-    _dst.create(src1.dims, src1.size, dtype);
-    Mat dst = _dst.getMat();
-
-    if( haveMask && reallocate )
-        dst = Scalar::all(0);
-
+    size_t bufesz = (cvtsrc1 ? wsz : 0) +
+                    (cvtsrc2 || haveScalar ? wsz : 0) +
+                    (cvtdst ? wsz : 0) +
+                    (haveMask ? dsz : 0);
    BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];

    if( !haveScalar )
@@ -1476,7 +1720,7 @@ static BinaryFunc* getAbsDiffTab()
 void cv::add( InputArray src1, InputArray src2, OutputArray dst,
          InputArray mask, int dtype )
 {
-    arithm_op(src1, src2, dst, mask, dtype, getAddTab() );
+    arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD ); // muldiv=false and usrdata=0 are spelled out so that oclop (OCL_OP_ADD) can be passed
 }

 void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
@@ -1511,12 +1755,12 @@ void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
        }
    }
 #endif
-    arithm_op(src1, src2, dst, mask, dtype, getSubTab() );
+    arithm_op(src1, src2, dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
 }

 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
 {
-    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab());
+    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF); // no mask, dtype=-1, no extra parameters; OCL_OP_ABSDIFF is the operation id for the OpenCL branch of arithm_op
 }

 /****************************************************************************************\
@@ -1847,19 +2091,20 @@ static BinaryFunc* getRecipTab()
 void cv::multiply(InputArray src1, InputArray src2,
                   OutputArray dst, double scale, int dtype)
 {
-    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), true, &scale);
+    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
+              true, &scale, scale == 1. ? OCL_OP_MUL : OCL_OP_MUL_SCALE); // with scale == 1 the plain OCL_OP_MUL is used, for which ocl_arithm_op passes no extra scale argument
 }

 void cv::divide(InputArray src1, InputArray src2,
                 OutputArray dst, double scale, int dtype)
 {
-    arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale);
+    arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE); // scale travels via usrdata; ocl_arithm_op passes it as one extra kernel argument
 }

 void cv::divide(double scale, InputArray src2,
                 OutputArray dst, int dtype)
 {
-    arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale);
+    arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE); // src2 is passed as both operands; scale travels via usrdata as one extra kernel argument
 }

 /****************************************************************************************\
@@ -2020,7 +2265,7 @@ void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
                      double beta, double gamma, OutputArray dst, int dtype )
 {
    double scalars[] = {alpha, beta, gamma};
-    arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars);
+    arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
 }


--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@@ -220,6 +220,21 @@ void Mat::copyTo( OutputArray _dst ) const
        return;
    }

+    if( _dst.isUMat() )
+    {
+        _dst.create( dims, size.p, type() );
+        UMat dst = _dst.getUMat();
+
+        size_t i, sz[CV_MAX_DIM], dstofs[CV_MAX_DIM], esz = elemSize();
+        for( i = 0; i < (size_t)dims; i++ )
+            sz[i] = size.p[i];
+        sz[dims-1] *= esz;
+        dst.ndoffset(dstofs);
+        dstofs[dims-1] *= esz;
+        dst.u->currAllocator->upload(dst.u, data, dims, sz, dstofs, dst.step.p, step.p);
+        return;
+    }
+
    if( dims <= 2 )
    {
        _dst.create( rows, cols, type() );
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -1436,6 +1436,181 @@ Size _InputArray::size(int i) const
    }
 }

+
+int _InputArray::sizend(int* sz, int i) const // returns the number of dimensions; when sz is non-null also writes the size of each dimension into it
+{
+    int j, d=0, k = kind();
+
+    if( k == NONE )
+        ; // nothing wrapped: d stays 0 and sz is left untouched
+    else if( k == MAT )
+    {
+        CV_Assert( i < 0 );
+        const Mat& m = *(const Mat*)obj;
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else if( k == UMAT )
+    {
+        CV_Assert( i < 0 );
+        const UMat& m = *(const UMat*)obj;
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else if( k == STD_VECTOR_MAT && i >= 0 ) // i selects one Mat of the vector
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        CV_Assert( i < (int)vv.size() );
+        const Mat& m = vv[i];
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else if( k == STD_VECTOR_UMAT && i >= 0 ) // i selects one UMat of the vector
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        CV_Assert( i < (int)vv.size() );
+        const UMat& m = vv[i];
+        d = m.dims;
+        if(sz)
+            for(j = 0; j < d; j++)
+                sz[j] = m.size.p[j];
+    }
+    else // every remaining case (including the vector kinds with i < 0) is reported as 2D via size(i)
+    {
+        Size sz2d = size(i);
+        d = 2;
+        if(sz)
+        {
+            sz[0] = sz2d.height;
+            sz[1] = sz2d.width;
+        }
+    }
+
+    return d;
+}
+
+
+bool _InputArray::sameSize(const _InputArray& arr) const // true if this array and arr have the same size
+{
+    int k1 = kind(), k2 = arr.kind();
+    Size sz1;
+
+    if( k1 == MAT )
+    {
+        const Mat* m = ((const Mat*)obj);
+        if( k2 == MAT )
+            return m->size == ((const Mat*)arr.obj)->size; // Mat/UMat pairs compare their size members directly
+        if( k2 == UMAT )
+            return m->size == ((const UMat*)arr.obj)->size;
+        if( m->dims > 2 )
+            return false; // any other kind is compared as a 2D Size below
+        sz1 = m->size();
+    }
+    else if( k1 == UMAT )
+    {
+        const UMat* m = ((const UMat*)obj);
+        if( k2 == MAT )
+            return m->size == ((const Mat*)arr.obj)->size;
+        if( k2 == UMAT )
+            return m->size == ((const UMat*)arr.obj)->size;
+        if( m->dims > 2 )
+            return false; // any other kind is compared as a 2D Size below
+        sz1 = m->size();
+    }
+    else
+        sz1 = size();
+    if( arr.dims() > 2 ) // 2D comparison only: an array with more than 2 dimensions never matches here
+        return false;
+    return sz1 == arr.size();
+}
+
+int _InputArray::dims(int i) const
+{
+    // Number of dimensions of the wrapped array. For the vector-of-arrays kinds,
+    // i >= 0 selects one element and i < 0 describes the vector itself.
+    int k = kind();
+
+    if( k == MAT )
+    {
+        CV_Assert( i < 0 );
+        return ((const Mat*)obj)->dims;
+    }
+
+    if( k == EXPR )
+    {
+        CV_Assert( i < 0 );
+        return ((const MatExpr*)obj)->a.dims;
+    }
+
+    if( k == UMAT )
+    {
+        CV_Assert( i < 0 );
+        return ((const UMat*)obj)->dims;
+    }
+
+    if( k == MATX || k == STD_VECTOR )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    if( k == NONE )
+        return 0;
+
+    if( k == STD_VECTOR_VECTOR )
+    {
+        const std::vector<std::vector<uchar> >& vv = *(const std::vector<std::vector<uchar> >*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+        return 2;
+    }
+
+    if( k == STD_VECTOR_MAT )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+
+        return vv[i].dims;
+    }
+
+    // Handled like STD_VECTOR_MAT so that dims() agrees with sizend() and
+    // isContinuous(), which both accept vectors of UMat.
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+        return vv[i].dims;
+    }
+
+    if( k == OPENGL_BUFFER || k == GPU_MAT )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    if( k == OCL_MAT )
+    {
+        return 2;
+    }
+
+    CV_Assert( k == CUDA_MEM );
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+}
+
 size_t _InputArray::total(int i) const
 {
    int k = kind();
@@ -1570,6 +1745,61 @@ bool _InputArray::empty() const
        return ((const cuda::CudaMem*)obj)->empty();
 }

+bool _InputArray::isContinuous(int i) const // reports whether the wrapped array (or element i of a vector of arrays) is continuous
+{
+    int k = kind();
+
+    if( k == MAT )
+        return i < 0 ? ((const Mat*)obj)->isContinuous() : true; // for i >= 0 the Mat itself is not queried and true is returned
+
+    if( k == UMAT )
+        return i < 0 ? ((const UMat*)obj)->isContinuous() : true; // same rule as for MAT
+
+    if( k == EXPR || k == MATX || k == STD_VECTOR || k == NONE || k == STD_VECTOR_VECTOR)
+        return true; // these kinds are always reported as continuous
+
+    if( k == STD_VECTOR_MAT )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        CV_Assert((size_t)i < vv.size()); // the unsigned cast makes a negative i fail the check as well
+        return vv[i].isContinuous();
+    }
+
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        CV_Assert((size_t)i < vv.size()); // the unsigned cast makes a negative i fail the check as well
+        return vv[i].isContinuous();
+    }
+
+    CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet"); // reached for every kind not handled above
+    return false;
+}
+
+void _InputArray::copyTo(const _OutputArray& arr) const // copies the wrapped array into arr, dispatching on the kind of the source
+{
+    int k = kind();
+
+    if( k == NONE )
+        arr.release(); // empty source: the destination is released instead of copied into
+    else if( k == MAT || k == MATX || k == STD_VECTOR )
+    {
+        Mat m = getMat();
+        m.copyTo(arr);
+    }
+    else if( k == EXPR )
+    {
+        const MatExpr& e = *((MatExpr*)obj);
+        if( arr.kind() == MAT )
+            arr.getMatRef() = e; // Mat destination: assign the expression to it directly
+        else
+            Mat(e).copyTo(arr); // otherwise build a temporary Mat from the expression and copy that
+    }
+    else if( k == UMAT )
+        ((UMat*)obj)->copyTo(arr);
+    else
+        CV_Error(Error::StsNotImplemented, ""); // the remaining kinds are not supported
+}

 bool _OutputArray::fixedSize() const
 {
@@ -1899,6 +2129,12 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i,
    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }

+void _OutputArray::createSameSize(const _InputArray& arr, int mtype) const // creates this array with the dimensions of arr and element type mtype
+{
+    int sz[CV_MAX_DIM], d = arr.sizend(sz); // sizend() fills sz and returns the number of dimensions
+    create(d, sz, mtype);
+}
+
 void _OutputArray::release() const
 {
    CV_Assert(!fixedSize());
@@ -2010,6 +2246,23 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const
    return *(cuda::CudaMem*)obj;
 }

+void _OutputArray::setTo(const _InputArray& arr) const // forwards setTo(arr) to the wrapped array, dispatching on its kind
+{
+    int k = kind();
+
+    if( k == NONE )
+        ; // nothing wrapped: no-op
+    else if( k == MAT || k == MATX || k == STD_VECTOR )
+    {
+        Mat m = getMat();
+        m.setTo(arr);
+    }
+    else if( k == UMAT )
+        ((UMat*)obj)->setTo(arr);
+    else
+        CV_Error(Error::StsNotImplemented, ""); // the remaining kinds are not supported
+}
+
 static _InputOutputArray _none;
 InputOutputArray noArray() { return _none; }

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -592,9 +592,16 @@ static void* initOpenCLAndLoad(const char* funcname)
    {
        if(!initialized)
        {
-            handle = dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_LAZY);
+            const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME"); // optional override of the runtime library path
+            oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
+                "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
+            handle = dlopen(oclpath, RTLD_LAZY);
            initialized = true;
            g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
+            if( g_haveOpenCL )
+                fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
+            else
+                fprintf(stderr, "Failed to load OpenCL runtime\n");
        }
        if(!handle)
            return 0;
@@ -1212,16 +1219,13 @@ namespace cv { namespace ocl {

 struct UMat2D
 {
-    UMat2D(const UMat& m, int accessFlags)
+    UMat2D(const UMat& m)
    {
-        CV_Assert(m.dims == 2);
-        data = (cl_mem)m.handle(accessFlags);
        offset = m.offset;
        step = m.step;
        rows = m.rows;
        cols = m.cols;
    }
-    cl_mem data;
    size_t offset;
    size_t step;
    int rows;
@@ -1230,10 +1234,8 @@ struct UMat2D

 struct UMat3D
 {
-    UMat3D(const UMat& m, int accessFlags)
+    UMat3D(const UMat& m)
    {
-        CV_Assert(m.dims == 3);
-        data = (cl_mem)m.handle(accessFlags);
        offset = m.offset;
        step = m.step.p[1];
        slicestep = m.step.p[0];
@@ -1241,7 +1243,6 @@ struct UMat3D
        rows = m.size.p[1];
        cols = m.size.p[2];
    }
-    cl_mem data;
    size_t offset;
    size_t slicestep;
    size_t step;
@@ -1315,7 +1316,7 @@ void setUseOpenCL(bool flag)
    }
 }

-void finish()
+void finish2() // renamed from finish(); calls finish() on the default Queue
 {
    Queue::getDefault().finish();
 }
@@ -1528,7 +1529,7 @@ String Device::OpenCLVersion() const
 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }

 String Device::driverVersion() const
-{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
+{ return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); } // now queries CL_DRIVER_VERSION instead of the extensions string; empty String when p is null

 int Device::type() const
 { return p ? p->getProp<cl_device_type, int>(CL_DEVICE_TYPE) : 0; }
@@ -1705,14 +1706,14 @@ size_t Device::profilingTimerResolution() const

 const Device& Device::getDefault()
 {
-    const Context& ctx = Context::getDefault();
+    const Context2& ctx = Context2::getDefault(); // default context; its device(idx) returns a dummy Device when idx is out of range
    int idx = TLSData::get()->device;
    return ctx.device(idx);
 }

 /////////////////////////////////////////////////////////////////////////////////////////

-struct Context::Impl
+struct Context2::Impl
 {
    Impl(int dtype0)
    {
@@ -1777,7 +1778,7 @@ struct Context::Impl
        devices.clear();
    }

-    Program getProg(const ProgramSource& src,
+    Program getProg(const ProgramSource2& src,
                    const String& buildflags, String& errmsg)
    {
        String prefix = Program::getPrefix(buildflags);
@@ -1787,7 +1788,8 @@ struct Context::Impl
            return it->second;
        //String filename = format("%08x%08x_%08x%08x.clb2",
        Program prog(src, buildflags, errmsg);
-        phash.insert(std::pair<HashKey,Program>(k, prog));
+        if(prog.ptr())
+            phash.insert(std::pair<HashKey,Program>(k, prog));
        return prog;
    }

@@ -1797,7 +1799,7 @@ struct Context::Impl
    std::vector<Device> devices;
    bool initialized;

-    typedef ProgramSource::hash_t hash_t;
+    typedef ProgramSource2::hash_t hash_t;

    struct HashKey
    {
@@ -1812,18 +1814,18 @@ struct Context::Impl
 };


-Context::Context()
+Context2::Context2() // empty wrapper: p stays 0 and create() is not called
 {
    p = 0;
 }

-Context::Context(int dtype)
+Context2::Context2(int dtype) // starts empty, then calls create(dtype); the result of create() is not checked here
 {
    p = 0;
    create(dtype);
 }

-bool Context::create(int dtype0)
+bool Context2::create(int dtype0)
 {
    if( !haveOpenCL() )
        return false;
@@ -1838,19 +1840,19 @@ bool Context::create(int dtype0)
    return p != 0;
 }

-Context::~Context()
+Context2::~Context2()
 {
-    p->release();
+    if(p) p->release(); // p is 0 for a default-constructed Context2, so it must be checked before release()
 }

-Context::Context(const Context& c)
+Context2::Context2(const Context2& c) // shares the implementation object of c and adds a reference to it when non-null
 {
    p = (Impl*)c.p;
    if(p)
        p->addref();
 }

-Context& Context::operator = (const Context& c)
+Context2& Context2::operator = (const Context2& c)
 {
    Impl* newp = (Impl*)c.p;
    if(newp)
@@ -1861,30 +1863,30 @@ Context& Context::operator = (const Context& c)
    return *this;
 }

-void* Context::ptr() const
+void* Context2::ptr() const
 {
-    return p->handle;
+    return p ? p->handle : 0; // an empty context yields a null handle, matching Kernel::ptr(); Queue::Impl tests the result for null
 }

-size_t Context::ndevices() const
+size_t Context2::ndevices() const // number of devices held by the context; 0 when the context is empty
 {
    return p ? p->devices.size() : 0;
 }

-const Device& Context::device(size_t idx) const
+const Device& Context2::device(size_t idx) const // returns device idx, or a static dummy Device when the context is empty or idx is out of range
 {
    static Device dummy;
    return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
 }

-Context& Context::getDefault()
+Context2& Context2::getDefault()
 {
-    static Context ctx;
+    static Context2 ctx;
    if( !ctx.p && haveOpenCL() )
    {
-        // do not create new Context right away.
+        // do not create new Context2 right away.
        // First, try to retrieve existing context of the same type.
-        // In its turn, Platform::getContext() may call Context::create()
+        // In its turn, Platform::getContext() may call Context2::create()
        // if there is no such context.
        ctx.create(Device::TYPE_ACCELERATOR);
        if(!ctx.p)
@@ -1898,7 +1900,7 @@ Context& Context::getDefault()
    return ctx;
 }

-Program Context::getProg(const ProgramSource& prog,
+Program Context2::getProg(const ProgramSource2& prog,
                         const String& buildopts, String& errmsg)
 {
    return p ? p->getProg(prog, buildopts, errmsg) : Program();
@@ -1906,14 +1908,14 @@ Program Context::getProg(const ProgramSource& prog,

 struct Queue::Impl
 {
-    Impl(const Context& c, const Device& d)
+    Impl(const Context2& c, const Device& d)
    {
        refcount = 1;
-        const Context* pc = &c;
+        const Context2* pc = &c;
        cl_context ch = (cl_context)pc->ptr();
        if( !ch )
        {
-            pc = &Context::getDefault();
+            pc = &Context2::getDefault();
            ch = (cl_context)pc->ptr();
        }
        cl_device_id dh = (cl_device_id)d.ptr();
@@ -1943,7 +1945,7 @@ Queue::Queue()
    p = 0;
 }

-Queue::Queue(const Context& c, const Device& d)
+Queue::Queue(const Context2& c, const Device& d)
 {
    p = 0;
    create(c, d);
@@ -1973,7 +1975,7 @@ Queue::~Queue()
        p->release();
 }

-bool Queue::create(const Context& c, const Device& d)
+bool Queue::create(const Context2& c, const Device& d)
 {
    if(p)
        p->release();
@@ -1996,7 +1998,7 @@ Queue& Queue::getDefault()
 {
    Queue& q = TLSData::get()->oclQueue;
    if( !q.p )
-        q.create(Context::getDefault());
+        q.create(Context2::getDefault());
    return q;
 }

@@ -2008,15 +2010,20 @@ static cl_command_queue getQueue(const Queue& q)
    return qq;
 }

-KernelArg::KernelArg(int _flags, UMat* _m, void* _obj, size_t _sz)
-    : flags(_flags), m(_m), obj(_obj), sz(_sz)
+KernelArg::KernelArg() // empty argument: no flags, no UMat, no raw data, column scale 1
+    : flags(0), m(0), obj(0), sz(0), wscale(1)
+{
+}
+
+KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, const void* _obj, size_t _sz) // _wscale is multiplied into the column count in Kernel::set
+    : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale)
 {
 }

 KernelArg KernelArg::Constant(const Mat& m)
 {
    CV_Assert(m.isContinuous());
-    return KernelArg(CONSTANT, 0, m.data, m.total()*m.elemSize());
+    return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize()); // the added 1 is the new wscale parameter; the data pointer and byte size are unchanged
 }


@@ -2099,8 +2106,8 @@ Kernel::Kernel(const char* kname, const Program& prog)
    create(kname, prog);
 }

-Kernel::Kernel(const char* kname, const ProgramSource& src,
-               const String& buildopts, String& errmsg)
+Kernel::Kernel(const char* kname, const ProgramSource2& src,
+               const String& buildopts, String* errmsg)
 {
    p = 0;
    create(kname, src, buildopts, errmsg);
@@ -2143,15 +2150,17 @@ bool Kernel::create(const char* kname, const Program& prog)
    return p != 0;
 }

-bool Kernel::create(const char* kname, const ProgramSource& src,
-                    const String& buildopts, String& errmsg)
+bool Kernel::create(const char* kname, const ProgramSource2& src,
+                    const String& buildopts, String* errmsg) // errmsg is now a pointer and may be null
 {
    if(p)
    {
        p->release();
        p = 0;
    }
-    const Program& prog = Context::getDefault().getProg(src, buildopts, errmsg);
+    String tempmsg; // receives the build message when the caller passed no errmsg
+    if( !errmsg ) errmsg = &tempmsg;
+    const Program& prog = Context2::getDefault().getProg(src, buildopts, *errmsg); // program obtained from the default context
    return create(kname, prog);
 }

@@ -2160,55 +2169,91 @@ void* Kernel::ptr() const
    return p ? p->handle : 0;
 }

-void Kernel::set(int i, const void* value, size_t sz)
+bool Kernel::empty() const
 {
-    CV_Assert( p && clSetKernelArg(p->handle, (cl_uint)i, sz, value) >= 0 );
-    if( i == 0 )
-        p->cleanupUMats();
+    return ptr() == 0;
 }

-void Kernel::set(int i, const UMat& m)
+int Kernel::set(int i, const void* value, size_t sz)   // sets raw argument i; returns i+1 on success, -1 on failure
 {
-    set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
-}
-
-void Kernel::set(int i, const KernelArg& arg)
-{
-    CV_Assert( p && p->handle );
+    CV_Assert(i >= 0);
-    if( i == 0 )
+    if( p && i == 0 )   // p is only validated below, so guard the dereference here
         p->cleanupUMats();
+    if( !p || !p->handle || clSetKernelArg(p->handle, (cl_uint)i, sz, value) < 0 )
+        return -1;
+    return i+1;
+}
+
+int Kernel::set(int i, const UMat& m)   // passes m as a READ_WRITE matrix argument; returns the next free argument index or -1
+{
+    return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 1, 0, 0));   // wscale=1: set() multiplies cols by wscale, 0 would pass cols == 0
+}
+
+int Kernel::set(int i, const KernelArg& arg)   // sets one logical argument starting at index i; returns the next free index or -1
+{
+    CV_Assert( i >= 0 );
+    if( p && i == 0 )   // p is only validated below, so guard the dereference here
+        p->cleanupUMats();
+    if( !p || !p->handle )
+        return -1;
    if( arg.m )
    {
        int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
                          ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
+        cl_mem h = (cl_mem)arg.m->handle(accessFlags);
+
        if( arg.m->dims <= 2 )
        {
-            UMat2D u2d(*arg.m, accessFlags);
-            clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d), &u2d);
+            UMat2D u2d(*arg.m);
+            clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);   // a matrix expands to: buffer, step, offset[, rows, cols]
+            clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
+            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
+            i += 3;
+
+            if( !(arg.flags & KernelArg::NO_SIZE) )
+            {
+                int cols = u2d.cols*arg.wscale;
+                clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
+                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.cols), &cols);
+                i += 2;
+            }
        }
        else
        {
-            UMat3D u3d(*arg.m, accessFlags);
-            clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d), &u3d);
+            UMat3D u3d(*arg.m);
+            clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);   // 3D: buffer, slicestep, step, offset[, slices, rows, cols]
+            clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
+            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
+            clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
+            i += 4;
+            if( !(arg.flags & KernelArg::NO_SIZE) )
+            {
+                int cols = u3d.cols*arg.wscale;
+                clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);   // was &u3d.rows: wrong value for the slices argument
+                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
+                clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
+                i += 3;
+            }
        }
        p->addUMat(*arg.m);
+        return i;
    }
-    else
-    {
-        clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
-    }
+    clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);   // non-matrix argument: raw bytes from obj/sz
+    return i+1;
+}


-void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsize[],
+bool Kernel::run(int dims, size_t globalsize[], size_t localsize[],
                 bool sync, const Queue& q)
 {
-    CV_Assert(p && p->handle && p->e == 0);
+    if(!p || !p->handle || p->e != 0)
+        return false;
    cl_command_queue qq = getQueue(q);
-    clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
-                           offset, globalsize, localsize, 0, 0,
-                           sync ? 0 : &p->e);
-    if( sync )
+    size_t offset[CV_MAX_DIM] = {0};
+    cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
+                                           offset, globalsize, localsize, 0, 0,
+                                           sync ? 0 : &p->e);
+    if( sync || retval < 0 )
    {
        clFinish(qq);
        p->cleanupUMats();
@@ -2218,14 +2263,17 @@ void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsiz
        p->addref();
        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
    }
+    return retval >= 0;
 }

-void Kernel::runTask(bool sync, const Queue& q)
+bool Kernel::runTask(bool sync, const Queue& q)
 {
-    CV_Assert(p && p->handle && p->e == 0);
+    if(!p || !p->handle || p->e != 0)
+        return false;
+
    cl_command_queue qq = getQueue(q);
-    clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
-    if( sync )
+    cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
+    if( sync || retval < 0 )
    {
        clFinish(qq);
        p->cleanupUMats();
@@ -2235,6 +2283,7 @@ void Kernel::runTask(bool sync, const Queue& q)
        p->addref();
        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
    }
+    return retval >= 0;
 }


@@ -2273,11 +2322,11 @@ size_t Kernel::localMemSize() const

 struct Program::Impl
 {
-    Impl(const ProgramSource& _src,
+    Impl(const ProgramSource2& _src,
         const String& _buildflags, String& errmsg)
    {
        refcount = 1;
-        const Context& ctx = Context::getDefault();
+        const Context2& ctx = Context2::getDefault();
        src = _src;
        buildflags = _buildflags;
        const String& srcstr = src.source();
@@ -2293,17 +2342,20 @@ struct Program::Impl
            void** deviceList = deviceListBuf;
            for( i = 0; i < n; i++ )
                deviceList[i] = ctx.device(i).ptr();
+            printf("Building the OpenCL program ...\n");
            retval = clBuildProgram(handle, n,
                                    (const cl_device_id*)deviceList,
                                    buildflags.c_str(), 0, 0);
            if( retval == CL_BUILD_PROGRAM_FAILURE )
            {
-                char buf[1024];
+                char buf[1<<16];
                size_t retsz = 0;
                clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG,
                                      sizeof(buf)-16, buf, &retsz);
                errmsg = String(buf);
+                CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str()));
            }
+            CV_Assert(retval >= 0);
        }
    }

@@ -2315,7 +2367,7 @@ struct Program::Impl
        if(_buf.empty())
            return;
        String prefix0 = Program::getPrefix(buildflags);
-        const Context& ctx = Context::getDefault();
+        const Context2& ctx = Context2::getDefault();
        const Device& dev = Device::getDefault();
        const char* pos0 = _buf.c_str();
        const char* pos1 = strchr(pos0, '\n');
@@ -2366,7 +2418,7 @@ struct Program::Impl

    IMPLEMENT_REFCOUNTABLE();

-    ProgramSource src;
+    ProgramSource2 src;
    String buildflags;
    cl_program handle;
 };
@@ -2374,7 +2426,7 @@ struct Program::Impl

 Program::Program() { p = 0; }

-Program::Program(const ProgramSource& src,
+Program::Program(const ProgramSource2& src,
        const String& buildflags, String& errmsg)
 {
    p = 0;
@@ -2405,7 +2457,7 @@ Program::~Program()
        p->release();
 }

-bool Program::create(const ProgramSource& src,
+bool Program::create(const ProgramSource2& src,
            const String& buildflags, String& errmsg)
 {
    if(p)
@@ -2419,9 +2471,9 @@ bool Program::create(const ProgramSource& src,
    return p != 0;
 }

-const ProgramSource& Program::source() const
+const ProgramSource2& Program::source() const
 {
-    static ProgramSource dummy;
+    static ProgramSource2 dummy;
    return p ? p->src : dummy;
 }

@@ -2455,7 +2507,7 @@ String Program::getPrefix() const

 String Program::getPrefix(const String& buildflags)
 {
-    const Context& ctx = Context::getDefault();
+    const Context2& ctx = Context2::getDefault();
    const Device& dev = ctx.device(0);
    return format("name=%s\ndriver=%s\nbuildflags=%s\n",
                  dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
@@ -2463,7 +2515,7 @@ String Program::getPrefix(const String& buildflags)

 ////////////////////////////////////////////////////////////////////////////////////////

-struct ProgramSource::Impl
+struct ProgramSource2::Impl
 {
    Impl(const char* _src)
    {
@@ -2482,39 +2534,39 @@ struct ProgramSource::Impl

    IMPLEMENT_REFCOUNTABLE();
    String src;
-    ProgramSource::hash_t h;
+    ProgramSource2::hash_t h;
 };


-ProgramSource::ProgramSource()
+ProgramSource2::ProgramSource2()
 {
    p = 0;
 }

-ProgramSource::ProgramSource(const char* prog)
+ProgramSource2::ProgramSource2(const char* prog)
 {
    p = new Impl(prog);
 }

-ProgramSource::ProgramSource(const String& prog)
+ProgramSource2::ProgramSource2(const String& prog)
 {
    p = new Impl(prog);
 }

-ProgramSource::~ProgramSource()
+ProgramSource2::~ProgramSource2()
 {
    if(p)
        p->release();
 }

-ProgramSource::ProgramSource(const ProgramSource& prog)
+ProgramSource2::ProgramSource2(const ProgramSource2& prog)
 {
    p = prog.p;
    if(p)
        p->addref();
 }

-ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
+ProgramSource2& ProgramSource2::operator = (const ProgramSource2& prog)
 {
    Impl* newp = (Impl*)prog.p;
    if(newp)
@@ -2525,13 +2577,13 @@ ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
    return *this;
 }

-const String& ProgramSource::source() const
+const String& ProgramSource2::source() const
 {
    static String dummy;
    return p ? p->src : dummy;
 }

-ProgramSource::hash_t ProgramSource::hash() const
+ProgramSource2::hash_t ProgramSource2::hash() const
 {
    return p ? p->h : 0;
 }
@@ -2551,7 +2603,7 @@ public:
        return u;
    }

-    void getBestFlags(const Context& ctx, int& createFlags, int& flags0) const
+    void getBestFlags(const Context2& ctx, int& createFlags, int& flags0) const
    {
        const Device& dev = ctx.device(0);
        createFlags = CL_MEM_READ_WRITE;
@@ -2574,7 +2626,7 @@ public:
            total *= sizes[i];
        }

-        Context& ctx = Context::getDefault();
+        Context2& ctx = Context2::getDefault();
        int createFlags = 0, flags0 = 0;
        getBestFlags(ctx, createFlags, flags0);

@@ -2603,7 +2655,7 @@ public:
        if(u->handle == 0)
        {
            CV_Assert(u->origdata != 0);
-            Context& ctx = Context::getDefault();
+            Context2& ctx = Context2::getDefault();
            int createFlags = 0, flags0 = 0;
            getBestFlags(ctx, createFlags, flags0);

@@ -2848,7 +2900,6 @@ public:
                            new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
                            new_dststep[0], new_dststep[1], dstptr, 0, 0, 0) >= 0 );
        }
-        clFinish(q);
    }

    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
@@ -2890,6 +2941,9 @@ public:

        if( iscontinuous )
        {
+            int crc = 0;
+            for( size_t i = 0; i < total; i++ )
+                crc ^= ((uchar*)srcptr)[i];
            CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
                CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) >= 0 );
        }
@@ -2949,10 +3003,11 @@ public:
        }
        else
        {
-            CV_Assert( clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
+            cl_int retval;
+            CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
                                               new_srcofs, new_dstofs, new_sz,
                                               new_srcstep[0], new_srcstep[1], new_dststep[0], new_dststep[1],
-                                               0, 0, 0) >= 0 );
+                                               0, 0, 0)) >= 0 );
        }

        dst->markHostCopyObsolete(true);
@@ -2969,4 +3024,61 @@ MatAllocator* getOpenCLAllocator()
    return &allocator;
 }

+const char* typeToStr(int t)   // maps a matrix type (depth + channels) to a type name from the table below
+{
+    static const char* tab[]=   // one row per depth, four entries per row (1..4 channels)
+    {
+        "uchar", "uchar2", "uchar3", "uchar4",
+        "char", "char2", "char3", "char4",
+        "ushort", "ushort2", "ushort3", "ushort4",
+        "short", "short2", "short3", "short4",
+        "int", "int2", "int3", "int4",
+        "float", "float2", "float3", "float4",
+        "double", "double2", "double3", "double4",
+        "?", "?", "?", "?"
+    };
+    int cn = CV_MAT_CN(t);
+    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1];   // cn == 4 is a valid column; only cn > 4 has no entry
+}
+
+const char* memopTypeToStr(int t)   // like typeToStr, but the table maps each depth to an unsigned/integer name
+{
+    static const char* tab[]=   // one row per depth, four entries per row (1..4 channels)
+    {
+        "uchar", "uchar2", "uchar3", "uchar4",
+        "uchar", "uchar2", "uchar3", "uchar4",
+        "ushort", "ushort2", "ushort3", "ushort4",
+        "ushort", "ushort2", "ushort3", "ushort4",
+        "int", "int2", "int3", "int4",
+        "int", "int2", "int3", "int4",
+        "long", "long2", "long3", "long4",
+        "?", "?", "?", "?"
+    };
+    int cn = CV_MAT_CN(t);
+    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1];   // cn == 4 is a valid column; only cn > 4 has no entry
+}
+
+const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)   // picks the conversion name for sdepth -> ddepth with cn channels
+{
+    if( sdepth == ddepth )
+        return "noconvert";   // a string literal is returned; buf is left untouched
+    const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
+    if( ddepth >= CV_32F ||
+        (ddepth == CV_32S && sdepth < CV_32S) ||
+        (ddepth == CV_16S && sdepth <= CV_8S) ||
+        (ddepth == CV_16U && sdepth == CV_8U))
+    {
+        sprintf(buf, "convert_%s", typestr);   // plain form, no _sat/_rte suffix
+    }
+    else if( sdepth >= CV_32F )
+    {
+        sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));   // _sat is added only when ddepth < CV_32S
+    }
+    else
+    {
+        sprintf(buf, "convert_%s_sat", typestr);
+    }
+    return buf;   // sprintf is unbounded: the caller must supply a buffer large enough for the name
+}
+
 }}
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@@ -0,0 +1,307 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*
+  Usage:
+     after compiling this program user gets a single kernel called KF.
+     the following flags should be passed:
+     1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP"
+     2) the actual operation performed, one of "-D OP_...", see below the list of operations.
+     2a) "-D dstDepth=<destination depth> [-D cn=<num channels>]"
+         for some operations, like min/max/and/or/xor it's enough
+     2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
+          -D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
+*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+#define CV_32S 4
+#define CV_32F 5
+
+#define dstelem *(__global dstT*)(dstptr + dst_index)
+#define noconvert(x) x
+// element accessors: the casts keep the __global address space of the buffers (and const for the sources)
+#ifndef workT
+
+    #define srcT1 dstT
+    #define srcT2 dstT
+    #define workT dstT
+    #define srcelem1 *(__global const dstT*)(srcptr1 + src1_index)
+    #define srcelem2 *(__global const dstT*)(srcptr2 + src2_index)
+    #define convertToDT noconvert
+
+#else
+
+    #define srcelem1 convertToWT1(*(__global const srcT1*)(srcptr1 + src1_index))
+    #define srcelem2 convertToWT2(*(__global const srcT2*)(srcptr2 + src2_index))
+
+#endif
+
+#define EXTRA_PARAMS
+
+#if defined OP_ADD_SAT
+#define PROCESS_ELEM dstelem = add_sat(srcelem1, srcelem2)
+
+#elif defined OP_ADD
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)
+
+#elif defined OP_SUB_SAT
+#define PROCESS_ELEM dstelem = sub_sat(srcelem1, srcelem2)
+
+#elif defined OP_SUB
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)
+
+#elif defined OP_RSUB_SAT
+#define PROCESS_ELEM dstelem = sub_sat(srcelem2, srcelem1)
+
+#elif defined OP_RSUB
+#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)
+
+#elif defined OP_ABSDIFF
+#define PROCESS_ELEM dstelem = abs_diff(srcelem1, srcelem2)
+
+#elif defined OP_AND
+#define PROCESS_ELEM dstelem = srcelem1 & srcelem2
+
+#elif defined OP_OR
+#define PROCESS_ELEM dstelem = srcelem1 | srcelem2
+
+#elif defined OP_XOR
+#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2
+
+#elif defined OP_NOT
+#define PROCESS_ELEM dstelem = ~srcelem1
+
+#elif defined OP_MIN
+#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)
+
+#elif defined OP_MAX
+#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)
+
+#elif defined OP_MUL
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)
+
+#elif defined OP_MUL_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale)
+
+#elif defined OP_DIV
+#define PROCESS_ELEM \
+        workT e2 = srcelem2, zero = (workT)(0); \
+        dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)
+
+#elif defined OP_DIV_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM \
+        workT e2 = srcelem2, zero = (workT)(0); \
+        dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero)
+
+#elif defined OP_RECIP_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM \
+        workT e1 = srcelem1, zero = (workT)(0); \
+        dstelem = convertToDT(e1 != zero ? scale / e1 : zero)
+
+#elif defined OP_ADDW
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)
+
+#elif defined OP_MAG
+#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)
+
+#elif defined OP_PHASE_RADIANS
+#define PROCESS_ELEM \
+        workT tmp = atan2(srcelem2, srcelem1); \
+        if(tmp < 0) tmp += 6.283185307179586232; \
+        dstelem = tmp
+
+#elif defined OP_PHASE_DEGREES
+    #define PROCESS_ELEM \
+    workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \
+    if(tmp < 0) tmp += 360; \
+    dstelem = tmp
+
+#elif defined OP_EXP
+#define PROCESS_ELEM dstelem = exp(srcelem1)
+
+#elif defined OP_SQRT
+#define PROCESS_ELEM dstelem = sqrt(srcelem1)
+
+#elif defined OP_LOG   // log of the absolute value; fabs is the floating-point built-in (abs is integer-only)
+#define PROCESS_ELEM dstelem = log(fabs(srcelem1))
+
+#elif defined OP_CMP
+#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)
+
+#elif defined OP_CONVERT
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1)
+
+#elif defined OP_CONVERT_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT alpha, workT beta
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta)
+
+#else
+#error "unknown op type"
+#endif
+
+#if defined UNARY_OP || defined MASK_UNARY_OP   // unary form: the second operand becomes a scalar kernel parameter
+#undef srcelem2
+#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_ADD_SAT || \
+    defined OP_SUB || defined OP_SUB_SAT || defined OP_RSUB || defined OP_RSUB_SAT || \
+    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX
+    #undef EXTRA_PARAMS
+    #define EXTRA_PARAMS , workT srcelem2
+#endif
+#endif
+
+#if defined BINARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,   // BINARY_OP variant: two sources, no mask
+                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )   // EXTRA_PARAMS appends the op-specific scalar parameters (may be empty)
+{
+    int x = get_global_id(0);   // checked against cols below
+    int y = get_global_id(1);   // checked against rows below
+
+    if (x < cols && y < rows)   // work-items outside the rows x cols range do nothing
+    {
+        int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);   // y*step + x*sizeof(element) + offset, added to a uchar pointer
+        int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2);
+        int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
+
+        PROCESS_ELEM;   // expands to the selected OP_* statement, which reads the *_index locals through srcelem1/srcelem2/dstelem
+        //printf("(x=%d, y=%d). %d, %d, %d\n", x, y, (int)srcelem1, (int)srcelem2, (int)dstelem);
+    }
+}
+
+#elif defined MASK_BINARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,   // MASK_BINARY_OP variant: two sources plus a mask
+                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
+                 __global const uchar* mask, int maskstep, int maskoffset,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )   // EXTRA_PARAMS appends the op-specific scalar parameters (may be empty)
+{
+    int x = get_global_id(0);   // checked against cols below
+    int y = get_global_id(1);   // checked against rows below
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset);   // mask is indexed with one uchar per x
+        if( mask[mask_index] )   // destination is written only where the mask is non-zero
+        {
+            int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
+            int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2);
+            int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
+
+            PROCESS_ELEM;   // expands to the selected OP_* statement
+        }
+    }
+}
+
+#elif defined UNARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,   // UNARY_OP variant: one source, no mask
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )   // for two-operand ops EXTRA_PARAMS carries the scalar srcelem2
+{
+    int x = get_global_id(0);   // checked against cols below
+    int y = get_global_id(1);   // checked against rows below
+
+    if (x < cols && y < rows)
+    {
+        int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);   // y*step + x*sizeof(element) + offset
+        int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
+
+        PROCESS_ELEM;   // expands to the selected OP_* statement
+    }
+}
+
+#elif defined MASK_UNARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,   // MASK_UNARY_OP variant: one source plus a mask
+                 __global const uchar* mask, int maskstep, int maskoffset,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )   // for two-operand ops EXTRA_PARAMS carries the scalar srcelem2
+{
+    int x = get_global_id(0);   // checked against cols below
+    int y = get_global_id(1);   // checked against rows below
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset);   // mask is indexed with one uchar per x
+        if( mask[mask_index] )   // destination is written only where the mask is non-zero
+        {
+            int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
+            int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
+
+            PROCESS_ELEM;   // expands to the selected OP_* statement
+        }
+    }
+}
+
+#else
+
+#error "Unknown operation type"
+
+#endif
+
+
+
+
--- a/modules/core/src/opencl/copyset.cl
+++ b/modules/core/src/opencl/copyset.cl
@@ -0,0 +1,74 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,   // writes value to dst where mask is non-zero
+                      __global uchar* dstptr, int dststep, int dstoffset,
+                      int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);   // checked against cols below
+    int y = get_global_id(1);   // checked against rows below
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset);   // mask is indexed with one uchar per x
+        if( mask[mask_index] )
+        {
+            int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
+            *(__global dstT*)(dstptr + dst_index) = value;   // cast keeps the __global address space of dstptr
+        }
+    }
+}
+
+__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,   // writes value to every element in the rows x cols range
+                  int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);   // checked against cols below
+    int y = get_global_id(1);   // checked against rows below
+
+    if (x < cols && y < rows)
+    {
+        int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset);   // y*step + x*sizeof(element) + offset
+        *(__global dstT*)(dstptr + dst_index) = value;   // cast keeps the __global address space of dstptr
+    }
+}
+
--- a/modules/core/src/opencl/mulspectrums.cl
+++ b/modules/core/src/opencl/mulspectrums.cl
@@ -0,0 +1,96 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+typedef float2 cfloat;
+inline cfloat cmulf(cfloat a, cfloat b)   // complex product, with .x as the real part and .y as the imaginary part
+{
+    return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
+}
+
+inline cfloat conjf(cfloat a)   // complex conjugate: keeps .x and negates .y
+{
+    return (cfloat)( a.x, - a.y );
+}
+
+__kernel void   // dst[idx] = a[idx] * b[idx] (complex product), both components multiplied by scale
+mulAndScaleSpectrumsKernel(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep   // divided by sizeof(cfloat) below, i.e. used as a row step in bytes; the same step indexes a, b and dst
+)
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint idx = mad24(y, mstep / sizeof(cfloat), x);   // element index, computed before the bounds check but only used inside it
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], b[idx]);
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
+__kernel void   // dst[idx] = a[idx] * conj(b[idx]) (complex product), both components multiplied by scale
+mulAndScaleSpectrumsKernel_CONJ(
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep   // divided by sizeof(cfloat) below, i.e. used as a row step in bytes; the same step indexes a, b and dst
+)
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint idx = mad24(y, mstep / sizeof(cfloat), x);   // element index, computed before the bounds check but only used inside it
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], conjf(b[idx]));   // second operand is conjugated before the product
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
--- a/modules/core/src/opencl/polarcart.cl
+++ b/modules/core/src/opencl/polarcart.cl
@@ -0,0 +1,73 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset,
+                          __global uchar* dstptr, int dststep, int dstoffset,
+                          int rows, int cols, dstT value ) // dstT is not defined in this file - presumably a -D build option
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    // NOTE(review): no polar->cartesian math here; the body only writes `value` where mask is non-zero - confirm intent
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset); // mask is read one byte per element
+        if( mask[mask_index] )
+        {
+            int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset); // byte offset of element (x, y) in dst
+            *(__global dstT*)(dstptr + dst_index) = value; // keep __global: the cast must not change the address space
+        }
+    }
+}
+
+__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset,
+                          int rows, int cols, dstT value ) // dstT is not defined in this file - presumably a -D build option
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    // NOTE(review): no cartesian->polar math here; the body only fills dst with `value` - confirm intent
+    if (x < cols && y < rows)
+    {
+        int dst_index  = mad24(y, dststep, x*sizeof(dstT) + dstoffset); // byte offset of element (x, y) in dst
+        *(__global dstT*)(dstptr + dst_index) = value; // keep __global: the cast must not change the address space
+    }
+}
--- a/modules/core/src/opencl/reductions.cl
+++ b/modules/core/src/opencl/reductions.cl
@@ -0,0 +1,104 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT) /* when requested, enable whichever fp64 extension macro the compiler defines */
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+#if FUNC_SUM /* FUNC(a, b) folds value a into accumulator b; the first non-zero FUNC_* selector wins. Here: plain sum */
+#define FUNC(a, b) b += a;
+#elif FUNC_ABS_SUM /* sum of absolute values; the comparison is made against a zero of type dstT */
+#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a;
+#elif FUNC_SQR_SUM /* sum of squares */
+#define FUNC(a, b) b += a * a;
+#else /* no selector is non-zero: stop the build */
+#error No sum function
+#endif
+
+/**************************************Array buffer SUM**************************************/
+// Partial reduction: every work-group folds the elements it visits with FUNC and writes one value to dst[group id].
+__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
+                                __global srcT *src, __global dstT *dst)
+{
+   unsigned int lid = get_local_id(0);
+   unsigned int gid = get_group_id(0);
+   unsigned int id = get_global_id(0);
+   unsigned int idx = offset + id + (id / cols) * invalid_cols; // recomputed for every id inside the loop below
+
+   __local dstT localmem_sum[128]; // NOTE(review): sized for half of a 256-item work-group - confirm the host's local size
+   dstT sum = (dstT)(0), temp;
+
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize) // stride is groupnum * 256
+   {
+       idx = offset + id + (id / cols) * invalid_cols; // adds invalid_cols extra elements for every full run of cols
+       temp = convertToDstT(src[idx]); // convertToDstT, srcT and dstT are not defined in this file
+       FUNC(temp, sum);
+   }
+
+   if (lid > 127) // work-items 128 and up park their partial sums in slots lid - 128 ...
+       localmem_sum[lid - 128] = sum;
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   if (lid < 128) // ... and work-items 0..127 add their own partial sum on top of that slot
+       localmem_sum[lid] = sum + localmem_sum[lid];
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1) // halving steps: 128 slots are folded down into slot 0
+   {
+       if (lid < lsize)
+       {
+           int lid2 = lsize + lid;
+           localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
+       }
+       barrier(CLK_LOCAL_MEM_FENCE); // placed outside the if so every work-item reaches it
+   }
+
+   if (lid == 0) // a single work-item publishes the group's result
+       dst[gid] = localmem_sum[0];
+}
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@@ -67,6 +67,8 @@
 #define GET_OPTIMIZED(func) (func)
 #endif

+#include "opencl_kernels.hpp"
+
 namespace cv
 {

@@ -205,13 +207,30 @@ enum { BLOCK_SIZE = 1024 };

 inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
 {
-    if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() )
+    if( sc.dims > 2 || !sc.isContinuous() ) // more than 2 dimensions or non-continuous storage: rejected
+        return false;
+    Size sz = sc.size(); // fetched once and reused by every shape test below
+    if(sz.width != 1 && sz.height != 1) // neither a single column nor a single row: rejected
         return false;
     int cn = CV_MAT_CN(atype);
     if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
         return false;
-    return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) ||
-           (sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
+    return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) || // one element, or cn elements in a column/row
+           (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4); // or Size(1, 4) of CV_64F when cn <= 4
+}
+
+inline bool checkScalar(InputArray sc, int atype, int sckind, int akind) // true if sc has a shape usable as a scalar for an array of type atype
+{
+    if( sc.dims() > 2 || !sc.isContinuous() ) // more than 2 dimensions or non-continuous storage: rejected
+        return false;
+    Size sz = sc.size();
+    if(sz.width != 1 && sz.height != 1) // neither a single column nor a single row: rejected
+        return false;
+    int cn = CV_MAT_CN(atype); // channel count taken from atype
+    if( akind == _InputArray::MATX && sckind != _InputArray::MATX ) // a MATX array is only paired with a MATX scalar
+        return false;
+    return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) || // one element, or cn elements in a column/row
+           (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4); // or Size(1, 4) of CV_64F when cn <= 4
 }

 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize );
@@ -227,7 +246,10 @@ struct TLSData
    static TLSData* get();
 };

-namespace ocl { MatAllocator* getOpenCLAllocator(); }
+namespace ocl
+{
+    MatAllocator* getOpenCLAllocator();
+}

 }

--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@@ -197,6 +197,7 @@ UMat Mat::getUMat(int accessFlags) const
    if(!u)
        return hdr;
    UMat::getStdAllocator()->allocate(u, accessFlags);
+    hdr.flags = flags;
    setSize(hdr, dims, size.p, step.p);
    finalizeHdr(hdr);
    hdr.u = u;
@@ -548,7 +549,8 @@ Mat UMat::getMat(int accessFlags) const
    CV_Assert(u->data != 0);
    Mat hdr(dims, size.p, type(), u->data + offset, step.p);
    hdr.u = u;
-    hdr.datastart = hdr.data = u->data;
+    hdr.datastart = u->data;
+    hdr.data = hdr.datastart + offset;
    hdr.datalimit = hdr.dataend = u->data + u->size;
    CV_XADD(&hdr.u->refcount, 1);
    return hdr;
@@ -617,7 +619,7 @@ void UMat::copyTo(OutputArray _dst) const
        void* dsthandle = dst.handle(ACCESS_WRITE);
        if( srchandle == dsthandle && dst.offset == offset )
            return;
-        ndoffset(dstofs);
+        dst.ndoffset(dstofs);
        CV_Assert(u->currAllocator == dst.u->currAllocator);
        u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false);
    }
@@ -633,6 +635,50 @@ void UMat::convertTo(OutputArray, int, double, double) const
    CV_Error(Error::StsNotImplemented, "");
 }

+UMat& UMat::setTo(InputArray _value, InputArray _mask) // fills *this with _value, only where _mask is non-zero if a mask is given; returns *this
+{
+    bool haveMask = !_mask.empty();
+    int t = type(), cn = CV_MAT_CN(t);
+    if( dims <= 2 && cn <= 4 && ocl::useOpenCL() ) // OpenCL path; every other case falls through to Mat::setTo below
+    {
+        Mat value = _value.getMat();
+        CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
+        double buf[4];
+        convertAndUnrollScalar(value, t, (uchar*)buf, 1);
+        // the program is built with dstT defined from the array type
+        char opts[1024];
+        sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(t));
+        // the kernel name depends on whether a mask was supplied
+        ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
+        if( !setK.empty() )
+        {
+            ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(t)); // passes CV_ELEM_SIZE(t) bytes of buf
+            UMat mask;
+            // argument order: (mask, dst, scalar) with a mask, (dst, scalar) without
+            if( haveMask )
+            {
+                mask = _mask.getUMat();
+                CV_Assert( mask.size() == size() && mask.type() == CV_8U );
+                ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+                ocl::KernelArg dstarg = ocl::KernelArg::ReadWrite(*this);
+                setK.args(maskarg, dstarg, scalararg);
+            }
+            else
+            {
+                ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(*this);
+                setK.args(dstarg, scalararg);
+            }
+            // global range is cols x rows; explicit casts because int -> size_t narrowing is ill-formed in a C++11 braced list
+            size_t globalsize[] = { (size_t)cols, (size_t)rows };
+            if( setK.run(2, globalsize, 0, false) )
+                return *this;
+        }
+    }
+    Mat m = getMat(haveMask ? ACCESS_RW : ACCESS_WRITE); // CPU fallback, also reached when the kernel is empty or run() fails
+    m.setTo(_value, _mask);
+    return *this;
+}
+
 UMat& UMat::operator = (const Scalar&)
 {
    CV_Error(Error::StsNotImplemented, "");
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@@ -91,11 +91,11 @@ bool CV_UMatTest::TestUMat()
 {
    try
    {
-        Mat a(100, 100, CV_16S), b;
+        Mat a(100, 100, CV_16SC2), b, c;
        randu(a, Scalar::all(-100), Scalar::all(100));
-        Rect roi(1, 3, 10, 20);
-        Mat ra(a, roi), rb;
-        UMat ua, ura;
+        Rect roi(1, 3, 5, 4);
+        Mat ra(a, roi), rb, rc, rc0;
+        UMat ua, ura, ub, urb, uc, urc;
        a.copyTo(ua);
        ua.copyTo(b);
        CHECK_DIFF(a, b);
@@ -112,6 +112,71 @@ bool CV_UMatTest::TestUMat()
        }
        ra.copyTo(rb);
        CHECK_DIFF(ra, rb);
+
+        b = a.clone();
+        ra = a(roi);
+        rb = b(roi);
+        randu(b, Scalar::all(-100), Scalar::all(100));
+        b.copyTo(ub);
+        urb = ub(roi);
+
+        /*std::cout << "==============================================\nbefore op (CPU):\n";
+        std::cout << "ra: " << ra << std::endl;
+        std::cout << "rb: " << rb << std::endl;*/
+
+        ra.copyTo(ura);
+        rb.copyTo(urb);
+        ra.release();
+        rb.release();
+        ura.copyTo(ra);
+        urb.copyTo(rb);
+
+        /*std::cout << "==============================================\nbefore op (GPU):\n";
+        std::cout << "ra: " << ra << std::endl;
+        std::cout << "rb: " << rb << std::endl;*/
+
+        cv::max(ra, rb, rc);
+        cv::max(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        /*std::cout << "==============================================\nafter op:\n";
+        std::cout << "rc: " << rc << std::endl;
+        std::cout << "rc0: " << rc0 << std::endl;*/
+
+        CHECK_DIFF(rc0, rc);
+
+        {
+        UMat tmp = rc0.getUMat(ACCESS_WRITE);
+        cv::max(ura, urb, tmp);
+        }
+        CHECK_DIFF(rc0, rc);
+
+        ura.copyTo(urc);
+        cv::max(urc, urb, urc);
+        urc.copyTo(rc0);
+        CHECK_DIFF(rc0, rc);
+
+        rc = ra ^ rb;
+        cv::bitwise_xor(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        /*std::cout << "==============================================\nafter op:\n";
+        std::cout << "ra: " << rc0 << std::endl;
+        std::cout << "rc: " << rc << std::endl;*/
+
+        CHECK_DIFF(rc0, rc);
+
+        rc = ra + rb;
+        cv::add(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        CHECK_DIFF(rc0, rc);
+
+        cv::subtract(ra, Scalar::all(5), rc);
+        cv::subtract(ura, Scalar::all(5), urc);
+        urc.copyTo(rc0);
+
+        CHECK_DIFF(rc0, rc);
    }
    catch (const test_excep& e)
    {