updated patch to bring in the first functions with "transparent API"

2013-11-18 11:48:00 -05:00 · 2013-11-18 11:48:00 -05:00 · d914f20a4c
commit d914f20a4c
parent bb4bf7a1f9
64 changed files with 13355 additions and 318 deletions
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@ -501,9 +501,10 @@ macro(ocv_glob_module_sources)
  file(GLOB cl_kernels "src/opencl/*.cl")
  if(HAVE_opencv_ocl AND cl_kernels)
    ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+    string(REGEX REPLACE "^opencv_" "" the_module_barename "${the_module}") # strip only the leading "opencv_" prefix
    add_custom_command(
      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
-      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+      COMMAND ${CMAKE_COMMAND} -DMODULE_NAME="${the_module_barename}" -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
      DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
    source_group("OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp")
--- a/cmake/cl2cpp.cmake
+++ b/cmake/cl2cpp.cmake
@ -4,6 +4,15 @@ list(SORT cl_list)
 string(REPLACE ".cpp" ".hpp" OUTPUT_HPP "${OUTPUT}")
 get_filename_component(OUTPUT_HPP_NAME "${OUTPUT_HPP}" NAME)

+if("${MODULE_NAME}" STREQUAL "ocl") # the ocl module keeps its entries directly in cv::ocl
+    set(nested_namespace_start "")
+    set(nested_namespace_end "")
+else()
+    set(new_mode ON) # additionally emit a ProgramSource2 object per kernel file
+    set(nested_namespace_start "namespace ${MODULE_NAME}\n{") # wrap entries in cv::ocl::<module>
+    set(nested_namespace_end "}")
+endif()
+
 set(STR_CPP "// This file is auto-generated. Do not edit!

 #include \"precomp.hpp\"
@ -13,16 +22,19 @@ namespace cv
 {
 namespace ocl
 {
+${nested_namespace_start}
+
 ")

 set(STR_HPP "// This file is auto-generated. Do not edit!

-#include \"opencv2/ocl/private/util.hpp\"
+#include \"opencv2/core/ocl_genbase.hpp\"

 namespace cv
 {
 namespace ocl
 {
+${nested_namespace_start}

 ")

@ -49,12 +61,19 @@ foreach(cl ${cl_list})

  string(MD5 hash "${lines}")

-  set(STR_CPP "${STR_CPP}const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
-  set(STR_HPP "${STR_HPP}extern const struct ProgramEntry ${cl_filename};\n")
+  set(STR_CPP_DECL "const struct ProgramEntry ${cl_filename}={\"${cl_filename}\",\n\"${lines}, \"${hash}\"};\n")
+  set(STR_HPP_DECL "extern const struct ProgramEntry ${cl_filename};\n")
+  if(new_mode)
+    set(STR_CPP_DECL "${STR_CPP_DECL}ProgramSource2 ${cl_filename}_oclsrc(${cl_filename}.programStr);\n")
+    set(STR_HPP_DECL "${STR_HPP_DECL}extern ProgramSource2 ${cl_filename}_oclsrc;\n")
+  endif()
+
+  set(STR_CPP "${STR_CPP}${STR_CPP_DECL}")
+  set(STR_HPP "${STR_HPP}${STR_HPP_DECL}")
 endforeach()

-set(STR_CPP "${STR_CPP}}\n}\n")
-set(STR_HPP "${STR_HPP}}\n}\n")
+set(STR_CPP "${STR_CPP}}\n${nested_namespace_end}}\n")
+set(STR_HPP "${STR_HPP}}\n${nested_namespace_end}}\n")

 file(WRITE "${OUTPUT}" "${STR_CPP}")

--- a/modules/bioinspired/src/precomp.hpp
+++ b/modules/bioinspired/src/precomp.hpp
@ -47,6 +47,7 @@
 #include "opencv2/bioinspired.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/ocl.hpp"

 #include <valarray>

--- a/modules/bioinspired/src/retina_ocl.cpp
+++ b/modules/bioinspired/src/retina_ocl.cpp
@ -56,6 +56,8 @@

 namespace cv
 {
+static ocl::ProgramEntry retina_kernel = ocl::bioinspired::retina_kernel;
+
 namespace bioinspired
 {
 namespace ocl
--- a/modules/core/include/opencv2/core.hpp
+++ b/modules/core/include/opencv2/core.hpp
@ -347,6 +347,10 @@ CV_EXPORTS_W void max(InputArray src1, InputArray src2, OutputArray dst);
 CV_EXPORTS void min(const Mat& src1, const Mat& src2, Mat& dst);
 //! computes per-element maximum of two arrays (dst = max(src1, src2))
 CV_EXPORTS void max(const Mat& src1, const Mat& src2, Mat& dst);
+//! computes per-element minimum of two arrays (dst = min(src1, src2))
+CV_EXPORTS void min(const UMat& src1, const UMat& src2, UMat& dst);
+//! computes per-element maximum of two arrays (dst = max(src1, src2))
+CV_EXPORTS void max(const UMat& src1, const UMat& src2, UMat& dst);

 //! computes square root of each matrix element (dst = src**0.5)
 CV_EXPORTS_W void sqrt(InputArray src, OutputArray dst);
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@ -58,6 +58,8 @@ namespace cv
 enum { ACCESS_READ=1<<24, ACCESS_WRITE=1<<25,
    ACCESS_RW=3<<24, ACCESS_MASK=ACCESS_RW, ACCESS_FAST=1<<26 };

+class CV_EXPORTS _OutputArray;
+
 //////////////////////// Input/Output Array Arguments /////////////////////////////////

 /*!
@ -116,12 +118,22 @@ public:
    void* getObj() const;

    virtual int kind() const;
+    virtual int dims(int i=-1) const;
    virtual Size size(int i=-1) const;
+    virtual int sizend(int* sz, int i=-1) const;
+    virtual bool sameSize(const _InputArray& arr) const;
    virtual size_t total(int i=-1) const;
    virtual int type(int i=-1) const;
    virtual int depth(int i=-1) const;
    virtual int channels(int i=-1) const;
+    virtual bool isContinuous(int i=-1) const;
    virtual bool empty() const;
+    virtual void copyTo(const _OutputArray& arr) const;
+    bool isMat() const;
+    bool isUMat() const;
+    bool isMatVectot() const; // NOTE(review): "Vectot" looks like a typo for "Vector"; renaming needs all callers updated
+    bool isUMatVector() const;
+    bool isMatx() const; // const like the sibling predicates, so it is callable on a const _InputArray

    virtual ~_InputArray();

@ -197,8 +209,10 @@ public:
    virtual void create(Size sz, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int rows, int cols, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
    virtual void create(int dims, const int* size, int type, int i=-1, bool allowTransposed=false, int fixedDepthMask=0) const;
+    virtual void createSameSize(const _InputArray& arr, int mtype) const;
    virtual void release() const;
    virtual void clear() const;
+    virtual void setTo(const _InputArray& value) const;
 };


--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@ -108,6 +108,12 @@ inline _InputArray::_InputArray(const cuda::CudaMem& cuda_mem)

 inline _InputArray::~_InputArray() {}

+inline bool _InputArray::isMat() const { return kind() == _InputArray::MAT; }
+inline bool _InputArray::isUMat() const  { return kind() == _InputArray::UMAT; }
+inline bool _InputArray::isMatVectot() const { return kind() == _InputArray::STD_VECTOR_MAT; }
+inline bool _InputArray::isUMatVector() const  { return kind() == _InputArray::STD_VECTOR_UMAT; }
+inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; }
+
 ////////////////////////////////////////////////////////////////////////////////////////

 inline _OutputArray::_OutputArray() { init(ACCESS_WRITE, 0); }
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -49,13 +49,13 @@ namespace cv { namespace ocl {
 CV_EXPORTS bool haveOpenCL();
 CV_EXPORTS bool useOpenCL();
 CV_EXPORTS void setUseOpenCL(bool flag);
-CV_EXPORTS void finish();
+CV_EXPORTS void finish2();

-class CV_EXPORTS Context;
+class CV_EXPORTS Context2;
 class CV_EXPORTS Device;
 class CV_EXPORTS Kernel;
 class CV_EXPORTS Program;
-class CV_EXPORTS ProgramSource;
+class CV_EXPORTS ProgramSource2;
 class CV_EXPORTS Queue;

 class CV_EXPORTS Device
@ -199,22 +199,22 @@ protected:
 };


-class CV_EXPORTS Context
+class CV_EXPORTS Context2
 {
 public:
-    Context();
-    explicit Context(int dtype);
-    ~Context();
-    Context(const Context& c);
-    Context& operator = (const Context& c);
+    Context2();
+    explicit Context2(int dtype);
+    ~Context2();
+    Context2(const Context2& c);
+    Context2& operator = (const Context2& c);

    bool create(int dtype);
    size_t ndevices() const;
    const Device& device(size_t idx) const;
-    Program getProg(const ProgramSource& prog,
+    Program getProg(const ProgramSource2& prog,
                    const String& buildopt, String& errmsg);

-    static Context& getDefault();
+    static Context2& getDefault();
    void* ptr() const;
 protected:
    struct Impl;
@ -226,12 +226,12 @@ class CV_EXPORTS Queue
 {
 public:
    Queue();
-    explicit Queue(const Context& c, const Device& d=Device());
+    explicit Queue(const Context2& c, const Device& d=Device());
    ~Queue();
    Queue(const Queue& q);
    Queue& operator = (const Queue& q);

-    bool create(const Context& c=Context(), const Device& d=Device());
+    bool create(const Context2& c=Context2(), const Device& d=Device());
    void finish();
    void* ptr() const;
    static Queue& getDefault();
@ -245,41 +245,55 @@ protected:
 class CV_EXPORTS KernelArg
 {
 public:
-    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8 };
-    KernelArg(int _flags, UMat* _m, void* _obj=0, size_t _sz=0);
+    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, NO_SIZE=256 };
+    KernelArg(int _flags, UMat* _m, int wscale=1, const void* _obj=0, size_t _sz=0);
+    KernelArg();

    static KernelArg Local() { return KernelArg(LOCAL, 0); }
-    static KernelArg ReadOnly(const UMat& m) { return KernelArg(READ_ONLY, (UMat*)&m); }
-    static KernelArg WriteOnly(const UMat& m) { return KernelArg(WRITE_ONLY, (UMat*)&m); }
+    static KernelArg ReadWrite(const UMat& m, int wscale=1)
+    { return KernelArg(READ_WRITE, (UMat*)&m, wscale); }
+    static KernelArg ReadWriteNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(READ_WRITE+NO_SIZE, (UMat*)&m, wscale); }
+    static KernelArg ReadOnly(const UMat& m, int wscale=1)
+    { return KernelArg(READ_ONLY, (UMat*)&m, wscale); }
+    static KernelArg WriteOnly(const UMat& m, int wscale=1)
+    { return KernelArg(WRITE_ONLY, (UMat*)&m, wscale); }
+    static KernelArg ReadOnlyNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(READ_ONLY+NO_SIZE, (UMat*)&m, wscale); }
+    static KernelArg WriteOnlyNoSize(const UMat& m, int wscale=1)
+    { return KernelArg(WRITE_ONLY+NO_SIZE, (UMat*)&m, wscale); }
    static KernelArg Constant(const Mat& m);
    template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
-    { return KernelArg(CONSTANT, 0, (void*)arr, n); }
+    { return KernelArg(CONSTANT, 0, 1, (const void*)arr, n); } // obj is const void*, so keep arr's constness

    int flags;
    UMat* m;
-    void* obj;
+    const void* obj;
    size_t sz;
+    int wscale;
 };

+
 class CV_EXPORTS Kernel
 {
 public:
    Kernel();
    Kernel(const char* kname, const Program& prog);
-    Kernel(const char* kname, const ProgramSource& prog,
-           const String& buildopts, String& errmsg);
+    Kernel(const char* kname, const ProgramSource2& prog,
+           const String& buildopts, String* errmsg=0);
    ~Kernel();
    Kernel(const Kernel& k);
    Kernel& operator = (const Kernel& k);

+    bool empty() const;
    bool create(const char* kname, const Program& prog);
-    bool create(const char* kname, const ProgramSource& prog,
-                const String& buildopts, String& errmsg);
+    bool create(const char* kname, const ProgramSource2& prog,
+                const String& buildopts, String* errmsg=0);

-    void set(int i, const void* value, size_t sz);
-    void set(int i, const UMat& m);
-    void set(int i, const KernelArg& arg);
-    template<typename _Tp> void set(int i, const _Tp& value)
+    int set(int i, const void* value, size_t sz);
+    int set(int i, const UMat& m);
+    int set(int i, const KernelArg& arg);
+    template<typename _Tp> int set(int i, const _Tp& value)
    { return set(i, &value, sizeof(value)); }

    template<typename _Tp0>
@ -291,26 +305,27 @@ public:
    template<typename _Tp0, typename _Tp1>
    Kernel& args(const _Tp0& a0, const _Tp1& a1)
    {
-        set(0, a0); set(1, a1); return *this;
+        int i = set(0, a0); set(i, a1); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2)
    {
-        set(0, a0); set(1, a1); set(2, a2); return *this;
+        int i = set(0, a0); i = set(i, a1); set(i, a2); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); set(i, a3); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
                 const _Tp3& a3, const _Tp4& a4)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2);
+        i = set(i, a3); set(i, a4); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2,
@ -318,8 +333,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2,
                 const _Tp3& a3, const _Tp4& a4, const _Tp5& a5)
    {
-        set(0, a0); set(1, a1); set(2, a2);
-        set(3, a3); set(4, a4); set(5, a5); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2);
+        i = set(i, a3); i = set(i, a4); set(i, a5); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -327,8 +342,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
-        set(4, a4); set(5, a5); set(6, a6); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
+        i = set(i, a4); i = set(i, a5); set(i, a6); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -336,8 +351,8 @@ public:
    Kernel& args(const _Tp0& a0, const _Tp1& a1, const _Tp2& a2, const _Tp3& a3,
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3);
-        set(4, a4); set(5, a5); set(6, a6); set(7, a7); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3);
+        i = set(i, a4); i = set(i, a5); i = set(i, a6); set(i, a7); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@ -346,8 +361,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4);
-        set(5, a5); set(6, a6); set(7, a7); set(8, a8); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
@ -356,8 +371,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); set(i, a9); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -367,8 +382,8 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); set(i, a10); return *this;
    }

    template<typename _Tp0, typename _Tp1, typename _Tp2, typename _Tp3,
@ -378,13 +393,13 @@ public:
                 const _Tp4& a4, const _Tp5& a5, const _Tp6& a6, const _Tp7& a7,
                 const _Tp8& a8, const _Tp9& a9, const _Tp10& a10, const _Tp11& a11)
    {
-        set(0, a0); set(1, a1); set(2, a2); set(3, a3); set(4, a4); set(5, a5);
-        set(6, a6); set(7, a7); set(8, a8); set(9, a9); set(10, a10); set(11, a11); return *this;
+        int i = set(0, a0); i = set(i, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); i = set(i, a5);
+        i = set(i, a6); i = set(i, a7); i = set(i, a8); i = set(i, a9); i = set(i, a10); set(i, a11); return *this;
    }

-    void run(int dims, size_t offset[], size_t globalsize[],
+    bool run(int dims, size_t globalsize[],
             size_t localsize[], bool sync, const Queue& q=Queue());
-    void runTask(bool sync, const Queue& q=Queue());
+    bool runTask(bool sync, const Queue& q=Queue());

    size_t workGroupSize() const;
    bool compileWorkGroupSize(size_t wsz[]) const;
@ -401,7 +416,7 @@ class CV_EXPORTS Program
 {
 public:
    Program();
-    Program(const ProgramSource& src,
+    Program(const ProgramSource2& src,
            const String& buildflags, String& errmsg);
    explicit Program(const String& buf);
    Program(const Program& prog);
@ -409,12 +424,12 @@ public:
    Program& operator = (const Program& prog);
    ~Program();

-    bool create(const ProgramSource& src,
+    bool create(const ProgramSource2& src,
                const String& buildflags, String& errmsg);
    bool read(const String& buf, const String& buildflags);
    bool write(String& buf) const;

-    const ProgramSource& source() const;
+    const ProgramSource2& source() const;
    void* ptr() const;

    String getPrefix() const;
@ -426,17 +441,17 @@ protected:
 };


-class CV_EXPORTS ProgramSource
+class CV_EXPORTS ProgramSource2
 {
 public:
    typedef uint64 hash_t;

-    ProgramSource();
-    explicit ProgramSource(const String& prog);
-    explicit ProgramSource(const char* prog);
-    ~ProgramSource();
-    ProgramSource(const ProgramSource& prog);
-    ProgramSource& operator = (const ProgramSource& prog);
+    ProgramSource2();
+    explicit ProgramSource2(const String& prog);
+    explicit ProgramSource2(const char* prog);
+    ~ProgramSource2();
+    ProgramSource2(const ProgramSource2& prog);
+    ProgramSource2& operator = (const ProgramSource2& prog);

    const String& source() const;
    hash_t hash() const;
@ -446,6 +461,10 @@ protected:
    Impl* p;
 };

+CV_EXPORTS const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf);
+CV_EXPORTS const char* typeToStr(int t);
+CV_EXPORTS const char* memopTypeToStr(int t);
+
 }}

 #endif
--- a/modules/core/include/opencv2/core/ocl_genbase.hpp
+++ b/modules/core/include/opencv2/core/ocl_genbase.hpp
@ -0,0 +1,60 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_OPENCL_GENBASE_HPP__
+#define __OPENCV_OPENCL_GENBASE_HPP__
+
+namespace cv
+{
+namespace ocl
+{
+
+struct ProgramEntry
+{
+    const char* name;
+    const char* programStr;
+    const char* programHash;
+};
+
+}
+}
+
+#endif
--- a/modules/core/src/arithm.cpp
+++ b/modules/core/src/arithm.cpp
@ -911,33 +911,112 @@ void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t bl
        scbuf[i] = scbuf[i - esz];
 }

-static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
-               InputArray _mask, const BinaryFunc* tab, bool bitwise)
+
+enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
+       OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
+       OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14 };
+
+static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
+    "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
+    "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", 0 };
+
+static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
+                          InputArray _mask, bool bitwise, int oclop, bool haveScalar )
 {
-    int kind1 = _src1.kind(), kind2 = _src2.kind();
-    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
+    bool haveMask = !_mask.empty();
+    int srctype = _src1.type();
+    int srcdepth = CV_MAT_DEPTH(srctype);
+    int cn = CV_MAT_CN(srctype);
+
+    if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) )
+        return false;
+
+    UMat src1 = _src1.getUMat(), src2;
+    UMat dst = _dst.getUMat(), mask = _mask.getUMat();
+
+    char opts[1024];
+    int kercn = haveMask || haveScalar ? cn : 1;
+    sprintf(opts, "-D %s%s -D %s -D dstT=%s",
+            (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"), oclop2str[oclop],
+            bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
+            ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)));
+
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
+    if( k.empty() )
+        return false;
+
+    int cscale = cn/kercn;
+    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
+    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
+                                       ocl::KernelArg::WriteOnly(dst, cscale);
+    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
+
+    if( haveScalar )
+    {
+        size_t esz = CV_ELEM_SIZE(srctype);
+        double buf[4] = {0,0,0,0};
+
+        if( oclop != OCL_OP_NOT )
+        {
+            Mat src2sc = _src2.getMat();
+            convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
+        }
+
+        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
+
+        if( !haveMask )
+            k.args(src1arg, dstarg, scalararg);
+        else
+            k.args(src1arg, maskarg, dstarg, scalararg);
+    }
+    else
+    {
+        src2 = _src2.getUMat();
+        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
+
+        if( !haveMask )
+            k.args(src1arg, src2arg, dstarg);
+        else
+            k.args(src1arg, src2arg, maskarg, dstarg);
+    }
+
+    size_t globalsize[] = { (size_t)src1.cols*cscale, (size_t)src1.rows }; // explicit casts: int -> size_t in braces is narrowing
+    return k.run(2, globalsize, 0, false);
+}
+
+
+static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
+                       InputArray _mask, const BinaryFunc* tab,
+                       bool bitwise, int oclop )
+{
+    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
+    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
+    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
+    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
+    int dims1 = psrc1->dims(), dims2 = psrc2->dims();
+    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
+    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
+    bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
+                        ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
    bool haveMask = !_mask.empty(), haveScalar = false;
    BinaryFunc func;
-    int c;

-    if( src1.dims <= 2 && src2.dims <= 2 && kind1 == kind2 &&
-        src1.size() == src2.size() && src1.type() == src2.type() && !haveMask )
+    if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
    {
-        _dst.create(src1.size(), src1.type());
-        Mat dst = _dst.getMat();
+        _dst.create(sz1, type1);
+        if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false) )
+            return;
        if( bitwise )
        {
            func = *tab;
-            c = (int)src1.elemSize();
+            cn = (int)CV_ELEM_SIZE(type1);
        }
        else
-        {
-            func = tab[src1.depth()];
-            c = src1.channels();
-        }
+            func = tab[depth1];

+        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst);
-        size_t len = sz.width*(size_t)c;
+        size_t len = sz.width*(size_t)cn;
        if( len == (size_t)(int)len )
        {
            sz.width = (int)len;
@ -946,56 +1025,67 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
        }
    }

-    if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
-        src1.size != src2.size || src1.type() != src2.type() )
+    if( oclop == OCL_OP_NOT )
+        haveScalar = true;
+    else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
+        !psrc1->sameSize(*psrc2) || type1 != type2 )
    {
-        if( checkScalar(src1, src2.type(), kind1, kind2) )
+        if( checkScalar(*psrc1, type2, kind1, kind2) )
+        {
            // src1 is a scalar; swap it with src2
-            swap(src1, src2);
-        else if( !checkScalar(src2, src1.type(), kind2, kind1) )
+            swap(psrc1, psrc2);
+            swap(type1, type2);
+            swap(depth1, depth2);
+            swap(cn, cn2);
+            swap(sz1, sz2);
+        }
+        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
                      "The operation is neither 'array op array' (where arrays have the same size and type), "
                      "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
    }
+    else
+    {
+        CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
+    }

-    size_t esz = src1.elemSize();
+    size_t esz = CV_ELEM_SIZE(type1);
    size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
-    int cn = src1.channels();
    BinaryFunc copymask = 0;
-    Mat mask;
    bool reallocate = false;

    if( haveMask )
    {
-        mask = _mask.getMat();
-        CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) );
-        CV_Assert( mask.size == src1.size );
+        int mtype = _mask.type();
+        CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
        copymask = getCopyMaskFunc(esz);
-        Mat tdst = _dst.getMat();
-        reallocate = tdst.size != src1.size || tdst.type() != src1.type();
+        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
    }

    AutoBuffer<uchar> _buf;
    uchar *scbuf = 0, *maskbuf = 0;

-    _dst.create(src1.dims, src1.size, src1.type());
-    Mat dst = _dst.getMat();
-
+    _dst.createSameSize(*psrc1, type1);
    // if this is mask operation and dst has been reallocated,
-    // we have to
+    // we have to clear the destination
    if( haveMask && reallocate )
-        dst = Scalar::all(0);
+        _dst.setTo(0.);
+
+    if( use_opencl && ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar ))
+        return;
+
+    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
+    Mat dst = _dst.getMat(), mask = _mask.getMat();

    if( bitwise )
    {
        func = *tab;
-        c = (int)esz;
+        cn = (int)esz;
    }
    else
    {
-        func = tab[src1.depth()];
-        c = cn;
+        func = tab[depth1];
    }

    if( !haveScalar )
@ -1006,8 +1096,8 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
        NAryMatIterator it(arrays, ptrs);
        size_t total = it.size, blocksize = total;

-        if( blocksize*c > INT_MAX )
-            blocksize = INT_MAX/c;
+        if( blocksize*cn > INT_MAX )
+            blocksize = INT_MAX/cn;

        if( haveMask )
        {
@ -1022,7 +1112,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
            {
                int bsz = (int)MIN(total - j, blocksize);

-                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*c, 1), 0 );
+                func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
@ -1054,7 +1144,7 @@ static void binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
            {
                int bsz = (int)MIN(total - j, blocksize);

-                func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*c, 1), 0 );
+                func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
                if( haveMask )
                {
                    copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
@ -1101,47 +1191,59 @@ static BinaryFunc* getMinTab()
 void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
-    binary_op(a, b, c, mask, &f, true);
+    binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
 }

 void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
-    binary_op(a, b, c, mask, &f, true);
+    binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
 }

 void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
-    binary_op(a, b, c, mask, &f, true);
+    binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
 }

 void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
 {
    BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
-    binary_op(a, a, c, mask, &f, true);
+    binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
 }

 void cv::max( InputArray src1, InputArray src2, OutputArray dst )
 {
-    binary_op(src1, src2, dst, noArray(), getMaxTab(), false );
+    binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
 }

 void cv::min( InputArray src1, InputArray src2, OutputArray dst )
 {
-    binary_op(src1, src2, dst, noArray(), getMinTab(), false );
+    binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
 }

 void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
 {
    OutputArray _dst(dst);
-    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false );
+    binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
 }

 void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
 {
    OutputArray _dst(dst);
-    binary_op(src1, src2, _dst, noArray(), getMinTab(), false );
+    binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
+}
+
+// UMat overload of cv::max: wraps dst in an OutputArray proxy and forwards to
+// binary_op() with the max function table and the OCL_OP_MAX operation code.
+void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
+{
+    OutputArray dstProxy(dst);
+    binary_op(src1, src2, dstProxy, noArray(), getMaxTab(), false, OCL_OP_MAX);
+}
+
+// UMat overload of cv::min: wraps dst in an OutputArray proxy and forwards to
+// binary_op() with the min function table and the OCL_OP_MIN operation code.
+void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
+{
+    OutputArray dstProxy(dst);
+    binary_op(src1, src2, dstProxy, noArray(), getMinTab(), false, OCL_OP_MIN);
 }


@ -1171,73 +1273,213 @@ static int actualScalarDepth(const double* data, int len)
        CV_32S;
 }

-static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
-               InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false, void* usrdata=0)
+
+// Tries to run an element-wise arithmetic operation through the OpenCL kernel "KF"
+// built from ocl::core::arithm_oclsrc.
+//
+//   _src1      - first input array (read as a UMat)
+//   _src2      - second input array, or the scalar operand when haveScalar is true
+//   _dst       - destination; it is not (re)allocated here, only its type is queried
+//   _mask      - optional mask; an empty mask means "no mask"
+//   wtype      - intermediate ("work") type; only its depth is used, the channel
+//                count is taken from _src1
+//   usrdata    - extra double parameters: 1 value for the *_SCALE operations,
+//                3 values for OCL_OP_ADDW, ignored otherwise
+//   oclop      - OCL_OP_* code, used as an index into oclop2str[]
+//   haveScalar - true when _src2 is a scalar rather than an array
+//
+// Returns false when the request is not handled here (invalid operation code, more
+// than 4 channels together with a mask or a scalar, kernel could not be created);
+// otherwise returns the result of Kernel::run(). The caller is expected to fall
+// back to the CPU implementation on false.
+static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
+                          InputArray _mask, int wtype,
+                          void* usrdata, int oclop,
+                          bool haveScalar )
 {
-    int kind1 = _src1.kind(), kind2 = _src2.kind();
-    Mat src1 = _src1.getMat(), src2 = _src2.getMat();
+    // arithm_op() defaults oclop to -1; never index oclop2str[] with it
+    if( oclop < 0 )
+        return false;
+
+    int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
+    bool haveMask = !_mask.empty();
+
+    if( (haveMask || haveScalar) && cn > 4 )
+        return false;
+
+    int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = CV_MAT_DEPTH(wtype);
+    wtype = CV_MAKETYPE(wdepth, cn);
+    // A scalar operand is converted to wtype (see convertAndUnrollScalar below) before
+    // it is handed to the kernel, so its effective type is wtype; a real second array
+    // keeps its own type.
+    int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2);
+
+    UMat src1 = _src1.getUMat(), src2;
+    UMat dst = _dst.getUMat(), mask = _mask.getUMat();
+
+    char opts[1024];
+    // with a mask or a scalar the kernel works on whole cn-channel elements,
+    // otherwise on single channels (the width is scaled by cn/kercn below)
+    int kercn = haveMask || haveScalar ? cn : 1;
+
+    if( (depth1 == depth2 || haveScalar) && ddepth == depth1 && wdepth == depth1 )
+    {
+        // all types coincide: no conversion macros are needed
+        // NOTE(review): oclopstr selects a saturating variant for depths <= CV_16S but is
+        // never passed to the kernel (oclop2str[oclop] is used below). Left unchanged
+        // because the macro names the kernel source actually defines are not visible
+        // here - confirm them before switching the argument to oclopstr.
+        const char* oclopstr = oclop2str[oclop];
+        if( wdepth <= CV_16S )
+        {
+            oclopstr = oclop == OCL_OP_ADD ? "OCL_OP_ADD_SAT" :
+                       oclop == OCL_OP_SUB ? "OCL_OP_SUB_SAT" :
+                       oclop == OCL_OP_RSUB ? "OCL_OP_RSUB_SAT" : oclopstr;
+        }
+        sprintf(opts, "-D %s%s -D %s -D dstT=%s",
+                (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
+                oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)));
+    }
+    else
+    {
+        // mixed types: describe sources, destination, work type and the conversions
+        char cvtstr[3][32];
+        sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT2=%s "
+                "-D dstT=%s -D workT=%s -D convertToWT1=%s "
+                "-D convertToWT2=%s -D convertToDT=%s",
+                (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
+                oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
+                ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
+                ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
+                ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
+                ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
+                ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
+                ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]));
+    }
+
+    // extra parameters arrive as doubles; when the work depth is CV_32F they are
+    // narrowed to float so that each one occupies CV_ELEM_SIZE(wdepth) bytes
+    const uchar* usrdata_p = (const uchar*)usrdata;
+    const double* usrdata_d = (const double*)usrdata;
+    float usrdata_f[3];
+    int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
+        oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
+    if( n > 0 && wdepth == CV_32F )
+    {
+        for( i = 0; i < n; i++ )
+            usrdata_f[i] = (float)usrdata_d[i];
+        usrdata_p = (const uchar*)usrdata_f;
+    }
+    size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
+
+    ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
+    if( k.empty() )
+        return false;
+
+    int cscale = cn/kercn;
+
+    ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cscale);
+    // with a mask the destination is passed as ReadWrite, otherwise as WriteOnly
+    ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cscale) :
+                                       ocl::KernelArg::WriteOnly(dst, cscale);
+    ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
+
+    if( haveScalar )
+    {
+        size_t esz = CV_ELEM_SIZE(wtype);
+        double buf[4]={0,0,0,0};
+        Mat src2sc = _src2.getMat();
+
+        if( !src2sc.empty() )
+        {
+            convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
+        }
+        ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, buf, esz);
+
+        if( !haveMask )
+            k.args(src1arg, dstarg, scalararg);
+        else
+            k.args(src1arg, maskarg, dstarg, scalararg);
+    }
+    else
+    {
+        src2 = _src2.getUMat();
+        ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cscale);
+
+        if( !haveMask )
+        {
+            if(n == 0)
+                k.args(src1arg, src2arg, dstarg);
+            else if(n == 1)
+                k.args(src1arg, src2arg, dstarg,
+                       ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz));
+            else if(n == 3)
+                k.args(src1arg, src2arg, dstarg,
+                       ocl::KernelArg(0, 0, 0, usrdata_p, usrdata_esz),
+                       ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
+                       ocl::KernelArg(0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
+            else
+                CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
+        }
+        else
+        {
+            k.args(src1arg, src2arg, maskarg, dstarg);
+        }
+    }
+
+    // one work item per kernel element: the width is expressed in kercn-channel units
+    size_t globalsize[] = { (size_t)src1.cols*cscale, (size_t)src1.rows };
+    return k.run(2, globalsize, 0, false);
+}
+
+
+static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
+                      InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
+                      void* usrdata=0, int oclop=-1 )
+{
+    const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
+    int kind1 = psrc1->kind(), kind2 = psrc2->kind();
    bool haveMask = !_mask.empty();
    bool reallocate = false;
+    int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
+    int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
+    int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
+    Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
+    Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
+    bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
+                        ocl::useOpenCL() && dims1 <= 2 && dims2 <= 2;
+    bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
+    bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);

-    bool src1Scalar = checkScalar(src1, src2.type(), kind1, kind2);
-    bool src2Scalar = checkScalar(src2, src1.type(), kind2, kind1);
-
-    if( (kind1 == kind2 || src1.channels() == 1) && src1.dims <= 2 && src2.dims <= 2 &&
-        src1.size() == src2.size() && src1.type() == src2.type() &&
-        !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == src1.depth())) ||
-                       (_dst.fixedType() && _dst.type() == _src1.type())) &&
+    if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
+        !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
+                       (_dst.fixedType() && _dst.type() == type1)) &&
        ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
    {
-        _dst.create(src1.size(), src1.type());
-        Mat dst = _dst.getMat();
+        _dst.createSameSize(*psrc1, type1);
+        if( use_opencl &&
+            ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
+                          (!usrdata ? type1 : std::max(depth1, CV_32F)),
+                          usrdata, oclop, false))
+            return;
+        Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
        Size sz = getContinuousSize(src1, src2, dst, src1.channels());
-        tab[src1.depth()](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
+        tab[depth1](src1.data, src1.step, src2.data, src2.step, dst.data, dst.step, sz, usrdata);
        return;
    }

    bool haveScalar = false, swapped12 = false;
-    int depth2 = src2.depth();
-    if( src1.size != src2.size || src1.channels() != src2.channels() ||
+
+    if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
        ((kind1 == _InputArray::MATX || kind2 == _InputArray::MATX) &&
-         src1.cols == 1 && src2.rows == 4) )
+         (sz1 == Size(1,4) || sz2 == Size(1,4))) )
    {
-        if( checkScalar(src1, src2.type(), kind1, kind2) )
+        if( checkScalar(*psrc1, type2, kind1, kind2) )
        {
            // src1 is a scalar; swap it with src2
-            swap(src1, src2);
+            swap(psrc1, psrc2);
+            swap(sz1, sz2);
+            swap(type1, type2);
+            swap(depth1, depth2);
+            swap(cn, cn2);
+            swap(dims1, dims2);
            swapped12 = true;
+            if( oclop == OCL_OP_SUB )
+                oclop = OCL_OP_RSUB;
        }
-        else if( !checkScalar(src2, src1.type(), kind2, kind1) )
+        else if( !checkScalar(*psrc2, type1, kind2, kind1) )
            CV_Error( CV_StsUnmatchedSizes,
-                     "The operation is neither 'array op array' (where arrays have the same size and the same number of channels), "
+                     "The operation is neither 'array op array' "
+                     "(where arrays have the same size and the same number of channels), "
                     "nor 'array op scalar', nor 'scalar op array'" );
        haveScalar = true;
-        CV_Assert(src2.type() == CV_64F && (src2.rows == 4 || src2.rows == 1));
+        CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));

        if (!muldiv)
        {
-            depth2 = actualScalarDepth(src2.ptr<double>(), src1.channels());
-            if( depth2 == CV_64F && (src1.depth() < CV_32S || src1.depth() == CV_32F) )
+            Mat sc = psrc2->getMat();
+            depth2 = actualScalarDepth(sc.ptr<double>(), cn);
+            if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
                depth2 = CV_32F;
        }
        else
            depth2 = CV_64F;
    }

-    int cn = src1.channels(), depth1 = src1.depth(), wtype;
-    BinaryFunc cvtsrc1 = 0, cvtsrc2 = 0, cvtdst = 0;
-
    if( dtype < 0 )
    {
        if( _dst.fixedType() )
            dtype = _dst.type();
        else
        {
-            if( !haveScalar && src1.type() != src2.type() )
+            if( !haveScalar && type1 != type2 )
                CV_Error(CV_StsBadArg,
                     "When the input arrays in add/subtract/multiply/divide functions have different types, "
                     "the output array type must be explicitly specified");
-            dtype = src1.type();
+            dtype = type1;
        }
    }
    dtype = CV_MAT_DEPTH(dtype);
@ -1262,39 +1504,41 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
        wtype = std::max(wtype, dtype);
    }

-    cvtsrc1 = depth1 == wtype ? 0 : getConvertFunc(depth1, wtype);
-    cvtsrc2 = depth2 == depth1 ? cvtsrc1 : depth2 == wtype ? 0 : getConvertFunc(depth2, wtype);
-    cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
-
    dtype = CV_MAKETYPE(dtype, cn);
    wtype = CV_MAKETYPE(wtype, cn);

-    size_t esz1 = src1.elemSize(), esz2 = src2.elemSize();
-    size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
-    size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
-    BinaryFunc copymask = 0;
-    Mat mask;
-
    if( haveMask )
    {
-        mask = _mask.getMat();
-        CV_Assert( (mask.type() == CV_8UC1 || mask.type() == CV_8SC1) );
-        CV_Assert( mask.size == src1.size );
-        copymask = getCopyMaskFunc(dsz);
-        Mat tdst = _dst.getMat();
-        reallocate = tdst.size != src1.size || tdst.type() != dtype;
+        int mtype = _mask.type();
+        CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
+        reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
    }

+    _dst.createSameSize(*psrc1, dtype);
+    if( reallocate )
+        _dst.setTo(0.);
+
+    if( use_opencl &&
+        ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
+                      usrdata, oclop, haveScalar))
+        return;
+
+    BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
+    BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
+    BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
+
+    size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
+    size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
+    size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
+    BinaryFunc copymask = getCopyMaskFunc(dsz);
+    Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();
+
    AutoBuffer<uchar> _buf;
    uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
-    size_t bufesz = (cvtsrc1 ? wsz : 0) + (cvtsrc2 || haveScalar ? wsz : 0) + (cvtdst ? wsz : 0) + (haveMask ? dsz : 0);
-
-    _dst.create(src1.dims, src1.size, dtype);
-    Mat dst = _dst.getMat();
-
-    if( haveMask && reallocate )
-        dst = Scalar::all(0);
-
+    size_t bufesz = (cvtsrc1 ? wsz : 0) +
+                    (cvtsrc2 || haveScalar ? wsz : 0) +
+                    (cvtdst ? wsz : 0) +
+                    (haveMask ? dsz : 0);
    BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];

    if( !haveScalar )
@ -1476,7 +1720,7 @@ static BinaryFunc* getAbsDiffTab()
 void cv::add( InputArray src1, InputArray src2, OutputArray dst,
          InputArray mask, int dtype )
 {
-    arithm_op(src1, src2, dst, mask, dtype, getAddTab() );
+    arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
 }

 void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
@ -1511,12 +1755,12 @@ void cv::subtract( InputArray src1, InputArray src2, OutputArray dst,
        }
    }
 #endif
-    arithm_op(src1, src2, dst, mask, dtype, getSubTab() );
+    arithm_op(src1, src2, dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
 }

 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
 {
-    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab());
+    arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
 }

 /****************************************************************************************\
@ -1847,19 +2091,20 @@ static BinaryFunc* getRecipTab()
 void cv::multiply(InputArray src1, InputArray src2,
                  OutputArray dst, double scale, int dtype)
 {
-    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(), true, &scale);
+    arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
+              true, &scale, scale == 1. ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
 }

 void cv::divide(InputArray src1, InputArray src2,
                OutputArray dst, double scale, int dtype)
 {
-    arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale);
+    arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
 }

 void cv::divide(double scale, InputArray src2,
                OutputArray dst, int dtype)
 {
-    arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale);
+    arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
 }

 /****************************************************************************************\
@ -2020,7 +2265,7 @@ void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
                      double beta, double gamma, OutputArray dst, int dtype )
 {
    double scalars[] = {alpha, beta, gamma};
-    arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars);
+    arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
 }


--- a/modules/core/src/copy.cpp
+++ b/modules/core/src/copy.cpp
@ -220,6 +220,21 @@ void Mat::copyTo( OutputArray _dst ) const
        return;
    }

+    if( _dst.isUMat() )
+    {
+        _dst.create( dims, size.p, type() );
+        UMat dst = _dst.getUMat();
+
+        size_t i, sz[CV_MAX_DIM], dstofs[CV_MAX_DIM], esz = elemSize();
+        for( i = 0; i < (size_t)dims; i++ )
+            sz[i] = size.p[i];
+        sz[dims-1] *= esz;
+        dst.ndoffset(dstofs);
+        dstofs[dims-1] *= esz;
+        dst.u->currAllocator->upload(dst.u, data, dims, sz, dstofs, dst.step.p, step.p);
+        return;
+    }
+
    if( dims <= 2 )
    {
        _dst.create( rows, cols, type() );
--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@ -1436,6 +1436,181 @@ Size _InputArray::size(int i) const
    }
 }

+
+// Stores the extent of every dimension of the wrapped array in sz[] (when sz is
+// non-null) and returns the number of dimensions.
+//   - NONE: returns 0 and leaves sz untouched;
+//   - MAT / UMAT: i must be negative, the matrix' own dims and sizes are reported;
+//   - STD_VECTOR_MAT / STD_VECTOR_UMAT with i >= 0: the i-th element is reported;
+//   - everything else: treated as 2D, sz = { size(i).height, size(i).width }.
+int _InputArray::sizend(int* sz, int i) const
+{
+    const int k = kind();
+
+    if( k == NONE )
+        return 0;
+
+    int ndims = 0;
+    const int* extents = 0; // per-dimension sizes of the selected matrix
+
+    if( k == MAT )
+    {
+        CV_Assert( i < 0 );
+        const Mat& m = *(const Mat*)obj;
+        ndims = m.dims;
+        extents = m.size.p;
+    }
+    else if( k == UMAT )
+    {
+        CV_Assert( i < 0 );
+        const UMat& m = *(const UMat*)obj;
+        ndims = m.dims;
+        extents = m.size.p;
+    }
+    else if( k == STD_VECTOR_MAT && i >= 0 )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        CV_Assert( i < (int)vv.size() );
+        ndims = vv[i].dims;
+        extents = vv[i].size.p;
+    }
+    else if( k == STD_VECTOR_UMAT && i >= 0 )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        CV_Assert( i < (int)vv.size() );
+        ndims = vv[i].dims;
+        extents = vv[i].size.p;
+    }
+    else
+    {
+        // generic fallback: query the 2D size and report it as rows x cols
+        const Size sz2d = size(i);
+        if( sz )
+        {
+            sz[0] = sz2d.height;
+            sz[1] = sz2d.width;
+        }
+        return 2;
+    }
+
+    if( sz )
+    {
+        for( int j = 0; j < ndims; j++ )
+            sz[j] = extents[j];
+    }
+
+    return ndims;
+}
+
+
+// Returns true when this array and arr have the same size.
+// When both sides are Mat/UMat their size members are compared directly.
+// Otherwise the comparison is done on 2D Size values; it yields false when this
+// Mat/UMat or arr has more than two dimensions.
+bool _InputArray::sameSize(const _InputArray& arr) const
+{
+    const int thisKind = kind(), otherKind = arr.kind();
+    Size thisSize;
+
+    if( thisKind == MAT )
+    {
+        const Mat& self = *(const Mat*)obj;
+        if( otherKind == MAT )
+            return self.size == ((const Mat*)arr.obj)->size;
+        if( otherKind == UMAT )
+            return self.size == ((const UMat*)arr.obj)->size;
+        if( self.dims > 2 )
+            return false;
+        thisSize = self.size();
+    }
+    else if( thisKind == UMAT )
+    {
+        const UMat& self = *(const UMat*)obj;
+        if( otherKind == MAT )
+            return self.size == ((const Mat*)arr.obj)->size;
+        if( otherKind == UMAT )
+            return self.size == ((const UMat*)arr.obj)->size;
+        if( self.dims > 2 )
+            return false;
+        thisSize = self.size();
+    }
+    else
+    {
+        thisSize = size();
+    }
+
+    return arr.dims() <= 2 && thisSize == arr.size();
+}
+
+// Returns the dimensionality of the wrapped array.
+//   - NONE: 0;
+//   - MAT / UMAT: the matrix' dims (i must be negative);
+//   - EXPR: dims of the expression's first operand (i must be negative);
+//   - STD_VECTOR_VECTOR: 1 for the container itself (i < 0), 2 for element i;
+//   - STD_VECTOR_MAT / STD_VECTOR_UMAT: 1 for the container itself (i < 0),
+//     dims of element i otherwise;
+//   - all remaining supported kinds: 2.
+// Any other kind triggers an assertion failure.
+int _InputArray::dims(int i) const
+{
+    int k = kind();
+
+    if( k == MAT )
+    {
+        CV_Assert( i < 0 );
+        return ((const Mat*)obj)->dims;
+    }
+
+    if( k == EXPR )
+    {
+        CV_Assert( i < 0 );
+        return ((const MatExpr*)obj)->a.dims;
+    }
+
+    if( k == UMAT )
+    {
+        CV_Assert( i < 0 );
+        return ((const UMat*)obj)->dims;
+    }
+
+    if( k == MATX )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    if( k == STD_VECTOR )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    if( k == NONE )
+        return 0;
+
+    if( k == STD_VECTOR_VECTOR )
+    {
+        const std::vector<std::vector<uchar> >& vv = *(const std::vector<std::vector<uchar> >*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+        return 2;
+    }
+
+    if( k == STD_VECTOR_MAT )
+    {
+        const std::vector<Mat>& vv = *(const std::vector<Mat>*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+
+        return vv[i].dims;
+    }
+
+    // vectors of UMat are accepted by sizend() and isContinuous(); handle them
+    // here too instead of falling through to the final assertion
+    if( k == STD_VECTOR_UMAT )
+    {
+        const std::vector<UMat>& vv = *(const std::vector<UMat>*)obj;
+        if( i < 0 )
+            return 1;
+        CV_Assert( i < (int)vv.size() );
+
+        return vv[i].dims;
+    }
+
+    if( k == OPENGL_BUFFER )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    if( k == GPU_MAT )
+    {
+        CV_Assert( i < 0 );
+        return 2;
+    }
+
+    if( k == OCL_MAT )
+    {
+        return 2;
+    }
+
+    // the only kind left is CUDA_MEM; anything else is unsupported
+    CV_Assert( k == CUDA_MEM );
+    CV_Assert( i < 0 );
+    return 2;
+}
+
 size_t _InputArray::total(int i) const
 {
    int k = kind();
@ -1570,6 +1745,61 @@ bool _InputArray::empty() const
        return ((const cuda::CudaMem*)obj)->empty();
 }

+// Reports whether the wrapped array (or its i-th element for vectors of
+// Mat/UMat) is continuous.
+// MAT/UMAT are only queried when i < 0 and report true otherwise; EXPR, MATX,
+// STD_VECTOR, NONE and STD_VECTOR_VECTOR always report true. Any other kind
+// raises CV_StsNotImplemented.
+bool _InputArray::isContinuous(int i) const
+{
+    switch( kind() )
+    {
+    case MAT:
+        return i >= 0 || ((const Mat*)obj)->isContinuous();
+
+    case UMAT:
+        return i >= 0 || ((const UMat*)obj)->isContinuous();
+
+    case EXPR:
+    case MATX:
+    case STD_VECTOR:
+    case NONE:
+    case STD_VECTOR_VECTOR:
+        return true;
+
+    case STD_VECTOR_MAT:
+        {
+            const std::vector<Mat>& mats = *(const std::vector<Mat>*)obj;
+            CV_Assert((size_t)i < mats.size());
+            return mats[i].isContinuous();
+        }
+
+    case STD_VECTOR_UMAT:
+        {
+            const std::vector<UMat>& umats = *(const std::vector<UMat>*)obj;
+            CV_Assert((size_t)i < umats.size());
+            return umats[i].isContinuous();
+        }
+
+    default:
+        break;
+    }
+
+    CV_Error(CV_StsNotImplemented, "This method is not implemented for oclMat yet");
+    return false;
+}
+
+// Copies the wrapped array into arr.
+// An empty input (NONE) releases arr; MAT/MATX/STD_VECTOR are copied through a
+// Mat header; EXPR is assigned directly when arr wraps a Mat and is otherwise
+// evaluated into a temporary Mat first; UMAT uses UMat::copyTo. Any other kind
+// raises StsNotImplemented.
+void _InputArray::copyTo(const _OutputArray& arr) const
+{
+    const int k = kind();
+
+    if( k == NONE )
+    {
+        arr.release();
+        return;
+    }
+
+    if( k == MAT || k == MATX || k == STD_VECTOR )
+    {
+        Mat header = getMat();
+        header.copyTo(arr);
+        return;
+    }
+
+    if( k == EXPR )
+    {
+        const MatExpr& expr = *((MatExpr*)obj);
+        if( arr.kind() == MAT )
+            arr.getMatRef() = expr;
+        else
+            Mat(expr).copyTo(arr);
+        return;
+    }
+
+    if( k == UMAT )
+    {
+        ((UMat*)obj)->copyTo(arr);
+        return;
+    }
+
+    CV_Error(Error::StsNotImplemented, "");
+}

 bool _OutputArray::fixedSize() const
 {
@ -1899,6 +2129,12 @@ void _OutputArray::create(int dims, const int* sizes, int mtype, int i,
    CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
 }

+// Allocates this output array with the dimensions reported by arr.sizend()
+// and the element type mtype.
+void _OutputArray::createSameSize(const _InputArray& arr, int mtype) const
+{
+    int extents[CV_MAX_DIM];
+    const int ndims = arr.sizend(extents);
+    create(ndims, extents, mtype);
+}
+
 void _OutputArray::release() const
 {
    CV_Assert(!fixedSize());
@ -2010,6 +2246,23 @@ cuda::CudaMem& _OutputArray::getCudaMemRef() const
    return *(cuda::CudaMem*)obj;
 }

+// Assigns the value arr to the wrapped array.
+// Nothing happens for NONE; MAT/MATX/STD_VECTOR are set through a Mat header;
+// UMAT uses UMat::setTo. Any other kind raises StsNotImplemented.
+void _OutputArray::setTo(const _InputArray& arr) const
+{
+    const int k = kind();
+
+    if( k == NONE )
+        return;
+
+    if( k == MAT || k == MATX || k == STD_VECTOR )
+    {
+        Mat header = getMat();
+        header.setTo(arr);
+        return;
+    }
+
+    if( k == UMAT )
+    {
+        ((UMat*)obj)->setTo(arr);
+        return;
+    }
+
+    CV_Error(Error::StsNotImplemented, "");
+}
+
 static _InputOutputArray _none;
 InputOutputArray noArray() { return _none; }

--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -592,9 +592,16 @@ static void* initOpenCLAndLoad(const char* funcname)
    {
        if(!initialized)
        {
-            handle = dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_LAZY);
+            const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
+            oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
+                "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
+            handle = dlopen(oclpath, RTLD_LAZY);
            initialized = true;
            g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
+            if( g_haveOpenCL )
+                fprintf(stderr, "Succesffuly loaded OpenCL v1.1+ runtime from %s\n", oclpath);
+            else
+                fprintf(stderr, "Failed to load OpenCL runtime\n");
        }
        if(!handle)
            return 0;
@ -1212,16 +1219,13 @@ namespace cv { namespace ocl {

 struct UMat2D
 {
-    UMat2D(const UMat& m, int accessFlags)
+    UMat2D(const UMat& m)
    {
-        CV_Assert(m.dims == 2);
-        data = (cl_mem)m.handle(accessFlags);
        offset = m.offset;
        step = m.step;
        rows = m.rows;
        cols = m.cols;
    }
-    cl_mem data;
    size_t offset;
    size_t step;
    int rows;
@ -1230,10 +1234,8 @@ struct UMat2D

 struct UMat3D
 {
-    UMat3D(const UMat& m, int accessFlags)
+    UMat3D(const UMat& m)
    {
-        CV_Assert(m.dims == 3);
-        data = (cl_mem)m.handle(accessFlags);
        offset = m.offset;
        step = m.step.p[1];
        slicestep = m.step.p[0];
@ -1241,7 +1243,6 @@ struct UMat3D
        rows = m.size.p[1];
        cols = m.size.p[2];
    }
-    cl_mem data;
    size_t offset;
    size_t slicestep;
    size_t step;
@ -1315,7 +1316,7 @@ void setUseOpenCL(bool flag)
    }
 }

-void finish()
+void finish2()
 {
    Queue::getDefault().finish();
 }
@ -1528,7 +1529,7 @@ String Device::OpenCLVersion() const
 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }

 String Device::driverVersion() const
-{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
+{ return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); }

 int Device::type() const
 { return p ? p->getProp<cl_device_type, int>(CL_DEVICE_TYPE) : 0; }
@ -1705,14 +1706,14 @@ size_t Device::profilingTimerResolution() const

 const Device& Device::getDefault()
 {
-    const Context& ctx = Context::getDefault();
+    const Context2& ctx = Context2::getDefault();
    int idx = TLSData::get()->device;
    return ctx.device(idx);
 }

 /////////////////////////////////////////////////////////////////////////////////////////

-struct Context::Impl
+struct Context2::Impl
 {
    Impl(int dtype0)
    {
@ -1777,7 +1778,7 @@ struct Context::Impl
        devices.clear();
    }

-    Program getProg(const ProgramSource& src,
+    Program getProg(const ProgramSource2& src,
                    const String& buildflags, String& errmsg)
    {
        String prefix = Program::getPrefix(buildflags);
@ -1787,7 +1788,8 @@ struct Context::Impl
            return it->second;
        //String filename = format("%08x%08x_%08x%08x.clb2",
        Program prog(src, buildflags, errmsg);
-        phash.insert(std::pair<HashKey,Program>(k, prog));
+        if(prog.ptr())
+            phash.insert(std::pair<HashKey,Program>(k, prog));
        return prog;
    }

@ -1797,7 +1799,7 @@ struct Context::Impl
    std::vector<Device> devices;
    bool initialized;

-    typedef ProgramSource::hash_t hash_t;
+    typedef ProgramSource2::hash_t hash_t;

    struct HashKey
    {
@ -1812,18 +1814,18 @@ struct Context::Impl
 };


-Context::Context()
+Context2::Context2()
 {
    p = 0;
 }

-Context::Context(int dtype)
+Context2::Context2(int dtype)
 {
    p = 0;
    create(dtype);
 }

-bool Context::create(int dtype0)
+bool Context2::create(int dtype0)
 {
    if( !haveOpenCL() )
        return false;
@ -1838,19 +1840,19 @@ bool Context::create(int dtype0)
    return p != 0;
 }

-Context::~Context()
+// Drops this handle's reference to the shared implementation, if there is one.
+// A default-constructed Context2 has p == 0, so the pointer must be checked
+// before release() is called on it.
+Context2::~Context2()
 {
-    p->release();
+    if(p)
+        p->release();
 }

-Context::Context(const Context& c)
+Context2::Context2(const Context2& c)
 {
    p = (Impl*)c.p;
    if(p)
        p->addref();
 }

-Context& Context::operator = (const Context& c)
+Context2& Context2::operator = (const Context2& c)
 {
    Impl* newp = (Impl*)c.p;
    if(newp)
@ -1861,30 +1863,30 @@ Context& Context::operator = (const Context& c)
    return *this;
 }

-void* Context::ptr() const
+// Returns the underlying context handle, or 0 for an empty (default-constructed)
+// Context2. Queue::Impl tests the returned handle for null, so an empty context
+// must not be dereferenced here.
+void* Context2::ptr() const
 {
-    return p->handle;
+    return p ? p->handle : 0;
 }

-size_t Context::ndevices() const
+size_t Context2::ndevices() const
 {
    return p ? p->devices.size() : 0;
 }

-const Device& Context::device(size_t idx) const
+const Device& Context2::device(size_t idx) const
 {
    static Device dummy;
    return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
 }

-Context& Context::getDefault()
+Context2& Context2::getDefault()
 {
-    static Context ctx;
+    static Context2 ctx;
    if( !ctx.p && haveOpenCL() )
    {
-        // do not create new Context right away.
+        // do not create new Context2 right away.
        // First, try to retrieve existing context of the same type.
-        // In its turn, Platform::getContext() may call Context::create()
+        // In its turn, Platform::getContext() may call Context2::create()
        // if there is no such context.
        ctx.create(Device::TYPE_ACCELERATOR);
        if(!ctx.p)
@ -1898,7 +1900,7 @@ Context& Context::getDefault()
    return ctx;
 }

-Program Context::getProg(const ProgramSource& prog,
+Program Context2::getProg(const ProgramSource2& prog,
                         const String& buildopts, String& errmsg)
 {
    return p ? p->getProg(prog, buildopts, errmsg) : Program();
@ -1906,14 +1908,14 @@ Program Context::getProg(const ProgramSource& prog,

 struct Queue::Impl
 {
-    Impl(const Context& c, const Device& d)
+    Impl(const Context2& c, const Device& d)
    {
        refcount = 1;
-        const Context* pc = &c;
+        const Context2* pc = &c;
        cl_context ch = (cl_context)pc->ptr();
        if( !ch )
        {
-            pc = &Context::getDefault();
+            pc = &Context2::getDefault();
            ch = (cl_context)pc->ptr();
        }
        cl_device_id dh = (cl_device_id)d.ptr();
@ -1943,7 +1945,7 @@ Queue::Queue()
    p = 0;
 }

-Queue::Queue(const Context& c, const Device& d)
+Queue::Queue(const Context2& c, const Device& d)
 {
    p = 0;
    create(c, d);
@ -1973,7 +1975,7 @@ Queue::~Queue()
        p->release();
 }

-bool Queue::create(const Context& c, const Device& d)
+bool Queue::create(const Context2& c, const Device& d)
 {
    if(p)
        p->release();
@ -1996,7 +1998,7 @@ Queue& Queue::getDefault()
 {
    Queue& q = TLSData::get()->oclQueue;
    if( !q.p )
-        q.create(Context::getDefault());
+        q.create(Context2::getDefault());
    return q;
 }

@ -2008,15 +2010,20 @@ static cl_command_queue getQueue(const Queue& q)
    return qq;
 }

-KernelArg::KernelArg(int _flags, UMat* _m, void* _obj, size_t _sz)
-    : flags(_flags), m(_m), obj(_obj), sz(_sz)
+KernelArg::KernelArg() // default argument: no flags, no matrix, no raw data, width scale of 1
+    : flags(0), m(0), obj(0), sz(0), wscale(1)
+{
+}
+
+KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, const void* _obj, size_t _sz) // stores every field as given
+    : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale) // wscale multiplies the column count in Kernel::set
 {
 }

 KernelArg KernelArg::Constant(const Mat& m)
 {
    CV_Assert(m.isContinuous());
-    return KernelArg(CONSTANT, 0, m.data, m.total()*m.elemSize());
+    return KernelArg(CONSTANT, 0, 1, m.data, m.total()*m.elemSize()); // no UMat, wscale 1, raw bytes of the continuous matrix
 }


@ -2099,8 +2106,8 @@ Kernel::Kernel(const char* kname, const Program& prog)
    create(kname, prog);
 }

-Kernel::Kernel(const char* kname, const ProgramSource& src,
-               const String& buildopts, String& errmsg)
+Kernel::Kernel(const char* kname, const ProgramSource2& src,
+               const String& buildopts, String* errmsg)
 {
    p = 0;
    create(kname, src, buildopts, errmsg);
@ -2143,15 +2150,17 @@ bool Kernel::create(const char* kname, const Program& prog)
    return p != 0;
 }

-bool Kernel::create(const char* kname, const ProgramSource& src,
-                    const String& buildopts, String& errmsg)
+bool Kernel::create(const char* kname, const ProgramSource2& src,
+                    const String& buildopts, String* errmsg) // errmsg may be null
 {
    if(p)
    {
        p->release();
        p = 0;
    }
-    const Program& prog = Context::getDefault().getProg(src, buildopts, errmsg);
+    String tempmsg; // receives the message when the caller passed no errmsg
+    if( !errmsg ) errmsg = &tempmsg;
+    const Program& prog = Context2::getDefault().getProg(src, buildopts, *errmsg); // program comes from the default context
    return create(kname, prog);
 }

@ -2160,55 +2169,91 @@ void* Kernel::ptr() const
    return p ? p->handle : 0;
 }

-void Kernel::set(int i, const void* value, size_t sz)
+bool Kernel::empty() const // true when ptr() reports no kernel handle
 {
-    CV_Assert( p && clSetKernelArg(p->handle, (cl_uint)i, sz, value) >= 0 );
-    if( i == 0 )
-        p->cleanupUMats();
+    return ptr() == 0;
 }

-void Kernel::set(int i, const UMat& m)
+int Kernel::set(int i, const void* value, size_t sz) // returns i+1 on success, -1 on failure
 {
-    set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
-}
-
-void Kernel::set(int i, const KernelArg& arg)
-{
-    CV_Assert( p && p->handle );
+    CV_Assert(i >= 0);
+    if( !p || !p->handle ) // validate before the first p-> dereference below
+        return -1;
    if( i == 0 )
        p->cleanupUMats();
+    return clSetKernelArg(p->handle, (cl_uint)i, sz, value) < 0 ? -1 : i+1;
+}
+
+int Kernel::set(int i, const UMat& m) // binds m as a read-write matrix argument; returns the next argument index or -1
+{
+    return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 1, 0, 0)); // wscale must be 1: it multiplies the column count sent to the kernel
+}
+
+int Kernel::set(int i, const KernelArg& arg) // returns the index after the last kernel parameter written, or -1 if the kernel is invalid
+{
+    CV_Assert( i >= 0 );
+    if( !p || !p->handle ) // must precede any p-> access, including the cleanup below
+        return -1;
+    if( i == 0 )
+        p->cleanupUMats();
    if( arg.m )
    {
        int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
                          ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
+        cl_mem h = (cl_mem)arg.m->handle(accessFlags);
+
        if( arg.m->dims <= 2 )
        {
-            UMat2D u2d(*arg.m, accessFlags);
-            clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d), &u2d);
+            UMat2D u2d(*arg.m);
+            clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h); // 2D layout: buffer, step, offset [, rows, cols]
+            clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
+            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
+            i += 3;
+
+            if( !(arg.flags & KernelArg::NO_SIZE) )
+            {
+                int cols = u2d.cols*arg.wscale;
+                clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
+                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.cols), &cols);
+                i += 2;
+            }
        }
        else
        {
-            UMat3D u3d(*arg.m, accessFlags);
-            clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d), &u3d);
+            UMat3D u3d(*arg.m);
+            clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h); // 3D layout: buffer, slicestep, step, offset [, slices, rows, cols]
+            clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
+            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
+            clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
+            i += 4;
+            if( !(arg.flags & KernelArg::NO_SIZE) )
+            {
+                int cols = u3d.cols*arg.wscale;
+                clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices); // pass the slice count, not rows
+                clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
+                clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
+                i += 3;
+            }
        }
        p->addUMat(*arg.m);
+        return i;
    }
-    else
-    {
-        clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
-    }
+    clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj); // plain data argument: sz bytes read from obj
+    return i+1;
 }


-void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsize[],
+bool Kernel::run(int dims, size_t globalsize[], size_t localsize[],
                 bool sync, const Queue& q)
 {
-    CV_Assert(p && p->handle && p->e == 0);
+    if(!p || !p->handle || p->e != 0)
+        return false;
    cl_command_queue qq = getQueue(q);
-    clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
-                           offset, globalsize, localsize, 0, 0,
-                           sync ? 0 : &p->e);
-    if( sync )
+    size_t offset[CV_MAX_DIM] = {0};
+    cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
+                                           offset, globalsize, localsize, 0, 0,
+                                           sync ? 0 : &p->e);
+    if( sync || retval < 0 )
    {
        clFinish(qq);
        p->cleanupUMats();
@ -2218,14 +2263,17 @@ void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsiz
        p->addref();
        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
    }
+    return retval >= 0;
 }

-void Kernel::runTask(bool sync, const Queue& q)
+bool Kernel::runTask(bool sync, const Queue& q)
 {
-    CV_Assert(p && p->handle && p->e == 0);
+    if(!p || !p->handle || p->e != 0)
+        return false;
+
    cl_command_queue qq = getQueue(q);
-    clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
-    if( sync )
+    cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
+    if( sync || retval < 0 )
    {
        clFinish(qq);
        p->cleanupUMats();
@ -2235,6 +2283,7 @@ void Kernel::runTask(bool sync, const Queue& q)
        p->addref();
        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
    }
+    return retval >= 0;
 }


@ -2273,11 +2322,11 @@ size_t Kernel::localMemSize() const

 struct Program::Impl
 {
-    Impl(const ProgramSource& _src,
+    Impl(const ProgramSource2& _src,
         const String& _buildflags, String& errmsg)
    {
        refcount = 1;
-        const Context& ctx = Context::getDefault();
+        const Context2& ctx = Context2::getDefault();
        src = _src;
        buildflags = _buildflags;
        const String& srcstr = src.source();
@ -2293,17 +2342,20 @@ struct Program::Impl
            void** deviceList = deviceListBuf;
            for( i = 0; i < n; i++ )
                deviceList[i] = ctx.device(i).ptr();
+            printf("Building the OpenCL program ...\n");
            retval = clBuildProgram(handle, n,
                                    (const cl_device_id*)deviceList,
                                    buildflags.c_str(), 0, 0);
            if( retval == CL_BUILD_PROGRAM_FAILURE )
            {
-                char buf[1024];
+                char buf[1<<16];
                size_t retsz = 0;
                clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG,
                                      sizeof(buf)-16, buf, &retsz);
                errmsg = String(buf);
+                CV_Error_(Error::StsAssert, ("OpenCL program can not be built: %s", errmsg.c_str()));
            }
+            CV_Assert(retval >= 0);
        }
    }

@ -2315,7 +2367,7 @@ struct Program::Impl
        if(_buf.empty())
            return;
        String prefix0 = Program::getPrefix(buildflags);
-        const Context& ctx = Context::getDefault();
+        const Context2& ctx = Context2::getDefault();
        const Device& dev = Device::getDefault();
        const char* pos0 = _buf.c_str();
        const char* pos1 = strchr(pos0, '\n');
@ -2366,7 +2418,7 @@ struct Program::Impl

    IMPLEMENT_REFCOUNTABLE();

-    ProgramSource src;
+    ProgramSource2 src;
    String buildflags;
    cl_program handle;
 };
@ -2374,7 +2426,7 @@ struct Program::Impl

 Program::Program() { p = 0; }

-Program::Program(const ProgramSource& src,
+Program::Program(const ProgramSource2& src,
        const String& buildflags, String& errmsg)
 {
    p = 0;
@ -2405,7 +2457,7 @@ Program::~Program()
        p->release();
 }

-bool Program::create(const ProgramSource& src,
+bool Program::create(const ProgramSource2& src,
            const String& buildflags, String& errmsg)
 {
    if(p)
@ -2419,9 +2471,9 @@ bool Program::create(const ProgramSource& src,
    return p != 0;
 }

-const ProgramSource& Program::source() const
+const ProgramSource2& Program::source() const // source held by the implementation, or a shared empty one
 {
-    static ProgramSource dummy;
+    static ProgramSource2 dummy; // returned by reference when there is no implementation
    return p ? p->src : dummy;
 }

@ -2455,7 +2507,7 @@ String Program::getPrefix() const

 String Program::getPrefix(const String& buildflags)
 {
-    const Context& ctx = Context::getDefault();
+    const Context2& ctx = Context2::getDefault();
    const Device& dev = ctx.device(0);
    return format("name=%s\ndriver=%s\nbuildflags=%s\n",
                  dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
@ -2463,7 +2515,7 @@ String Program::getPrefix(const String& buildflags)

 ////////////////////////////////////////////////////////////////////////////////////////

-struct ProgramSource::Impl
+struct ProgramSource2::Impl
 {
    Impl(const char* _src)
    {
@ -2482,39 +2534,39 @@ struct ProgramSource::Impl

    IMPLEMENT_REFCOUNTABLE();
    String src;
-    ProgramSource::hash_t h;
+    ProgramSource2::hash_t h;
 };


-ProgramSource::ProgramSource()
+ProgramSource2::ProgramSource2() // empty source: no implementation attached
 {
    p = 0;
 }

-ProgramSource::ProgramSource(const char* prog)
+ProgramSource2::ProgramSource2(const char* prog) // allocates a new implementation from a C string
 {
    p = new Impl(prog);
 }

-ProgramSource::ProgramSource(const String& prog)
+ProgramSource2::ProgramSource2(const String& prog) // allocates a new implementation from a String
 {
    p = new Impl(prog);
 }

-ProgramSource::~ProgramSource()
+ProgramSource2::~ProgramSource2() // releases the implementation, if any
 {
    if(p)
        p->release();
 }

-ProgramSource::ProgramSource(const ProgramSource& prog)
+ProgramSource2::ProgramSource2(const ProgramSource2& prog) // shares prog's implementation and adds a reference to it
 {
    p = prog.p;
    if(p)
        p->addref();
 }

-ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
+ProgramSource2& ProgramSource2::operator = (const ProgramSource2& prog)
 {
    Impl* newp = (Impl*)prog.p;
    if(newp)
@ -2525,13 +2577,13 @@ ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
    return *this;
 }

-const String& ProgramSource::source() const
+const String& ProgramSource2::source() const // source text, or a shared empty string when there is no implementation
 {
    static String dummy;
    return p ? p->src : dummy;
 }

-ProgramSource::hash_t ProgramSource::hash() const
+ProgramSource2::hash_t ProgramSource2::hash() const // stored hash, or 0 when there is no implementation
 {
    return p ? p->h : 0;
 }
@ -2551,7 +2603,7 @@ public:
        return u;
    }

-    void getBestFlags(const Context& ctx, int& createFlags, int& flags0) const
+    void getBestFlags(const Context2& ctx, int& createFlags, int& flags0) const
    {
        const Device& dev = ctx.device(0);
        createFlags = CL_MEM_READ_WRITE;
@ -2574,7 +2626,7 @@ public:
            total *= sizes[i];
        }

-        Context& ctx = Context::getDefault();
+        Context2& ctx = Context2::getDefault();
        int createFlags = 0, flags0 = 0;
        getBestFlags(ctx, createFlags, flags0);

@ -2603,7 +2655,7 @@ public:
        if(u->handle == 0)
        {
            CV_Assert(u->origdata != 0);
-            Context& ctx = Context::getDefault();
+            Context2& ctx = Context2::getDefault();
            int createFlags = 0, flags0 = 0;
            getBestFlags(ctx, createFlags, flags0);

@ -2848,7 +2900,6 @@ public:
                            new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
                            new_dststep[0], new_dststep[1], dstptr, 0, 0, 0) >= 0 );
        }
-        clFinish(q);
    }

    void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
@ -2890,6 +2941,9 @@ public:

        if( iscontinuous )
        {
+            int crc = 0;
+            for( size_t i = 0; i < total; i++ )
+                crc ^= ((uchar*)srcptr)[i];
            CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
                CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) >= 0 );
        }
@ -2949,10 +3003,11 @@ public:
        }
        else
        {
-            CV_Assert( clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
+            cl_int retval;
+            CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
                                               new_srcofs, new_dstofs, new_sz,
                                               new_srcstep[0], new_srcstep[1], new_dststep[0], new_dststep[1],
-                                               0, 0, 0) >= 0 );
+                                               0, 0, 0)) >= 0 );
        }

        dst->markHostCopyObsolete(true);
@ -2969,4 +3024,61 @@ MatAllocator* getOpenCLAllocator()
    return &allocator;
 }

+const char* typeToStr(int t) // OpenCL type name for an OpenCV type code; "?" when the table has no entry
+{
+    static const char* tab[]=
+    {
+        "uchar", "uchar2", "uchar3", "uchar4",
+        "char", "char2", "char3", "char4",
+        "ushort", "ushort2", "ushort3", "ushort4",
+        "short", "short2", "short3", "short4",
+        "int", "int2", "int3", "int4",
+        "float", "float2", "float3", "float4",
+        "double", "double2", "double3", "double4",
+        "?", "?", "?", "?"
+    };
+    int cn = CV_MAT_CN(t);
+    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1]; // 4 names per depth, so cn = 1..4 are all valid
+}
+
+const char* memopTypeToStr(int t) // integer type name per depth (e.g. float -> int, double -> long); "?" when absent
+{
+    static const char* tab[]=
+    {
+        "uchar", "uchar2", "uchar3", "uchar4",
+        "uchar", "uchar2", "uchar3", "uchar4",
+        "ushort", "ushort2", "ushort3", "ushort4",
+        "ushort", "ushort2", "ushort3", "ushort4",
+        "int", "int2", "int3", "int4",
+        "int", "int2", "int3", "int4",
+        "long", "long2", "long3", "long4",
+        "?", "?", "?", "?"
+    };
+    int cn = CV_MAT_CN(t);
+    return cn > 4 ? "?" : tab[CV_MAT_DEPTH(t)*4 + cn-1]; // 4 names per depth, so cn = 1..4 are all valid
+}
+
+const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf) // formats a "convert_<type>..." name into buf; buf size is not checked
+{
+    if( sdepth == ddepth )
+        return "noconvert"; // same depth: a literal is returned and buf is left untouched
+    const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
+    if( ddepth >= CV_32F || // floating-point destination
+        (ddepth == CV_32S && sdepth < CV_32S) || // the three cases below: the listed narrower source depths
+        (ddepth == CV_16S && sdepth <= CV_8S) ||
+        (ddepth == CV_16U && sdepth == CV_8U))
+    {
+        sprintf(buf, "convert_%s", typestr); // plain conversion, no suffix
+    }
+    else if( sdepth >= CV_32F )
+    {
+        sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : "")); // floating-point source: _rte suffix, plus _sat when ddepth < CV_32S
+    }
+    else
+    {
+        sprintf(buf, "convert_%s_sat", typestr); // remaining integer-to-integer cases get _sat
+    }
+    return buf;
+}
+
 }}
--- a/modules/core/src/opencl/arithm.cl
+++ b/modules/core/src/opencl/arithm.cl
@ -0,0 +1,307 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*
+  Usage:
+     after compiling this program user gets a single kernel called KF.
+     the following flags should be passed:
+     1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP"
+     2) the actual operation performed, one of "-D OP_...", see below the list of operations.
+     2a) "-D dstDepth=<destination depth> [-D cn=<num channels>]"
+         for some operations, like min/max/and/or/xor it's enough
+     2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
+          -D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
+*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+#define CV_32S 4
+#define CV_32F 5
+
+#define dstelem *(__global dstT*)(dstptr + dst_index) // the buffers are __global; the cast must keep that address space
+#define noconvert(x) x
+
+#ifndef workT
+
+    #define srcT1 dstT
+    #define srcT2 dstT
+    #define workT dstT
+    #define srcelem1 *(__global const dstT*)(srcptr1 + src1_index)
+    #define srcelem2 *(__global const dstT*)(srcptr2 + src2_index)
+    #define convertToDT noconvert
+
+#else
+
+    #define srcelem1 convertToWT1(*(__global const srcT1*)(srcptr1 + src1_index))
+    #define srcelem2 convertToWT2(*(__global const srcT2*)(srcptr2 + src2_index))
+
+#endif
+
+#define EXTRA_PARAMS
+
+#if defined OP_ADD_SAT
+#define PROCESS_ELEM dstelem = add_sat(srcelem1, srcelem2)
+
+#elif defined OP_ADD
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)
+
+#elif defined OP_SUB_SAT
+#define PROCESS_ELEM dstelem = sub_sat(srcelem1, srcelem2)
+
+#elif defined OP_SUB
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)
+
+#elif defined OP_RSUB_SAT
+#define PROCESS_ELEM dstelem = sub_sat(srcelem2, srcelem1)
+
+#elif defined OP_RSUB
+#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)
+
+#elif defined OP_ABSDIFF
+#define PROCESS_ELEM dstelem = abs_diff(srcelem1, srcelem2)
+
+#elif defined OP_AND
+#define PROCESS_ELEM dstelem = srcelem1 & srcelem2
+
+#elif defined OP_OR
+#define PROCESS_ELEM dstelem = srcelem1 | srcelem2
+
+#elif defined OP_XOR
+#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2
+
+#elif defined OP_NOT
+#define PROCESS_ELEM dstelem = ~srcelem1
+
+#elif defined OP_MIN
+#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)
+
+#elif defined OP_MAX
+#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)
+
+#elif defined OP_MUL
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)
+
+#elif defined OP_MUL_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale)
+
+#elif defined OP_DIV
+#define PROCESS_ELEM \
+        workT e2 = srcelem2, zero = (workT)(0); \
+        dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)
+
+#elif defined OP_DIV_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM \
+        workT e2 = srcelem2, zero = (workT)(0); \
+        dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero)
+
+#elif defined OP_RECIP_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT scale
+#define PROCESS_ELEM \
+        workT e1 = srcelem1, zero = (workT)(0); \
+        dstelem = convertToDT(e1 != zero ? scale / e1 : zero)
+
+#elif defined OP_ADDW
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)
+
+#elif defined OP_MAG
+#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)
+
+#elif defined OP_PHASE_RADIANS
+#define PROCESS_ELEM \
+        workT tmp = atan2(srcelem2, srcelem1); \
+        if(tmp < 0) tmp += 6.283185307179586232; \
+        dstelem = tmp
+
+#elif defined OP_PHASE_DEGREES
+    #define PROCESS_ELEM \
+    workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \
+    if(tmp < 0) tmp += 360; \
+    dstelem = tmp
+
+#elif defined OP_EXP
+#define PROCESS_ELEM dstelem = exp(srcelem1)
+
+#elif defined OP_SQRT
+#define PROCESS_ELEM dstelem = sqrt(srcelem1)
+
+#elif defined OP_LOG
+#define PROCESS_ELEM dstelem = log(fabs(srcelem1)) // fabs: abs() is only defined for integer types, log() only for floating point
+
+#elif defined OP_CMP
+#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)
+
+#elif defined OP_CONVERT
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1)
+
+#elif defined OP_CONVERT_SCALE
+#undef EXTRA_PARAMS
+#define EXTRA_PARAMS , workT alpha, workT beta
+#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta)
+
+#else
+#error "unknown op type"
+#endif
+
+#if defined UNARY_OP || defined MASK_UNARY_OP // two-operand ops in unary mode take the second operand as a kernel parameter
+#undef srcelem2
+#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_ADD_SAT || \
+    defined OP_SUB || defined OP_SUB_SAT || defined OP_RSUB || defined OP_RSUB_SAT || \
+    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX
+    #undef EXTRA_PARAMS
+    #define EXTRA_PARAMS , workT srcelem2
+#endif
+#endif
+
+#if defined BINARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, // each matrix: byte pointer, row step, start offset
+                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); // (int) keeps all mad24 arguments of one type
+        int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+        PROCESS_ELEM;
+        //printf("(x=%d, y=%d). %d, %d, %d\n", x, y, (int)srcelem1, (int)srcelem2, (int)dstelem);
+    }
+}
+
+#elif defined MASK_BINARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, // each matrix: byte pointer, row step, start offset
+                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
+                 __global const uchar* mask, int maskstep, int maskoffset,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset); // mask is read one byte per element
+        if( mask[mask_index] )
+        {
+            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); // (int) keeps all mad24 arguments of one type
+            int src2_index = mad24(y, srcstep2, x*(int)sizeof(srcT2) + srcoffset2);
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+            PROCESS_ELEM;
+        }
+    }
+}
+
+#elif defined UNARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, // each matrix: byte pointer, row step, start offset
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); // (int) keeps all mad24 arguments of one type
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+        PROCESS_ELEM;
+    }
+}
+
+#elif defined MASK_UNARY_OP
+
+__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1, // each matrix: byte pointer, row step, start offset
+                 __global const uchar* mask, int maskstep, int maskoffset,
+                 __global uchar* dstptr, int dststep, int dstoffset,
+                 int rows, int cols EXTRA_PARAMS )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset); // mask is read one byte per element
+        if( mask[mask_index] )
+        {
+            int src1_index = mad24(y, srcstep1, x*(int)sizeof(srcT1) + srcoffset1); // (int) keeps all mad24 arguments of one type
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+
+            PROCESS_ELEM;
+        }
+    }
+}
+
+#else
+
+#error "Unknown operation type"
+
+#endif
+
+
+
+
--- a/modules/core/src/opencl/copyset.cl
+++ b/modules/core/src/opencl/copyset.cl
@ -0,0 +1,74 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset, // writes value where the mask byte is non-zero
+                      __global uchar* dstptr, int dststep, int dstoffset,
+                      int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset);
+        if( mask[mask_index] )
+        {
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); // (int) keeps all mad24 arguments of one type
+            *(__global dstT*)(dstptr + dst_index) = value; // dstptr is __global; the cast must keep that address space
+        }
+    }
+}
+
+__kernel void set(__global uchar* dstptr, int dststep, int dstoffset, // writes value to every element inside rows x cols
+                  int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset); // (int) keeps all mad24 arguments of one type
+        *(__global dstT*)(dstptr + dst_index) = value; // dstptr is __global; the cast must keep that address space
+    }
+}
+
--- a/modules/core/src/opencl/mulspectrums.cl
+++ b/modules/core/src/opencl/mulspectrums.cl
@ -0,0 +1,96 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+typedef float2 cfloat; // complex number: x is the real part, y the imaginary part
+inline cfloat cmulf(cfloat a, cfloat b) // complex product a*b
+{
+    return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
+}
+
+inline cfloat conjf(cfloat a) // complex conjugate: negates the imaginary part
+{
+    return (cfloat)( a.x, - a.y );
+}
+
+__kernel void
+mulAndScaleSpectrumsKernel( // dst = a * b * scale, element-wise over complex values
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep
+)
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint idx = mad24(y, mstep / (uint)sizeof(cfloat), x); // (uint) keeps all mad24 arguments of one type
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], b[idx]);
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
+__kernel void
+mulAndScaleSpectrumsKernel_CONJ( // dst = a * conj(b) * scale, element-wise over complex values
+    __global const cfloat* a,
+    __global const cfloat* b,
+    float scale,
+    __global cfloat* dst,
+    uint cols,
+    uint rows,
+    uint mstep
+)
+{
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint idx = mad24(y, mstep / (uint)sizeof(cfloat), x); // (uint) keeps all mad24 arguments of one type
+    if (x < cols && y < rows)
+    {
+        cfloat v = cmulf(a[idx], conjf(b[idx]));
+        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
+    }
+}
--- a/modules/core/src/opencl/polarcart.cl
+++ b/modules/core/src/opencl/polarcart.cl
@ -0,0 +1,73 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Writes `value` into every destination element whose mask byte is non-zero.
+// NOTE(review): despite its name, the body is a masked fill and performs no
+// polar -> cartesian conversion; it looks like a placeholder - confirm.
+//   mask, maskstep, maskoffset : mask bytes, row pitch and start offset (one byte per element)
+//   dstptr, dststep, dstoffset : destination bytes, row pitch and start offset, all in bytes
+//   rows, cols                 : extent of the area to process
+//   value                      : element to store
+// dstT is not defined in this file; presumably it is supplied as a build option.
+__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset,
+                          __global uchar* dstptr, int dststep, int dstoffset,
+                          int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        int mask_index = mad24(y, maskstep, x + maskoffset);
+        if( mask[mask_index] )
+        {
+            // sizeof() yields size_t; cast it so that all mad24 operands are int.
+            int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+            // The cast must stay in the __global address space: OpenCL C does not
+            // allow converting a __global pointer to a pointer in another space.
+            *(__global dstT*)(dstptr + dst_index) = value;
+        }
+    }
+}
+
+// Writes `value` into every destination element of the rows x cols area.
+// NOTE(review): despite its name, the body is a plain fill and performs no
+// cartesian -> polar conversion; it looks like a placeholder - confirm.
+//   dstptr, dststep, dstoffset : destination bytes, row pitch and start offset, all in bytes
+//   rows, cols                 : extent of the area to process
+//   value                      : element to store
+// dstT is not defined in this file; presumably it is supplied as a build option.
+__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset,
+                          int rows, int cols, dstT value )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < cols && y < rows)
+    {
+        // sizeof() yields size_t; cast it so that all mad24 operands are int.
+        int dst_index  = mad24(y, dststep, x*(int)sizeof(dstT) + dstoffset);
+        // The cast must stay in the __global address space: OpenCL C does not
+        // allow converting a __global pointer to a pointer in another space.
+        *(__global dstT*)(dstptr + dst_index) = value;
+    }
+}
--- a/modules/core/src/opencl/reductions.cl
+++ b/modules/core/src/opencl/reductions.cl
@ -0,0 +1,104 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Enable double precision when the build requests it, using whichever of the
+// two fp64 extensions the compiler advertises.
+#ifdef DOUBLE_SUPPORT
+#if defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+// FUNC(v, acc) folds one converted element v into the running accumulator acc.
+// Exactly one of FUNC_SUM / FUNC_ABS_SUM / FUNC_SQR_SUM has to be set to a
+// non-zero value by the build options; otherwise compilation stops.
+#if FUNC_SUM
+#define FUNC(v, acc) acc += v;
+#elif FUNC_ABS_SUM
+#define FUNC(v, acc) acc += v >= (dstT)(0) ? v : -v;
+#elif FUNC_SQR_SUM
+#define FUNC(v, acc) acc += v * v;
+#else
+#error No sum function
+#endif
+
+/**************************************Array buffer SUM**************************************/
+
+// Partial reduction: every work-group folds a strided subset of src with FUNC
+// and writes one partial result to dst[group id].
+//   cols, invalid_cols : after every `cols` elements, `invalid_cols` buffer
+//                        entries are skipped when computing the source index
+//   offset             : index of the first element in src
+//   elemnum            : number of elements to reduce
+//   groupnum           : used to derive the per-iteration stride (groupnum * 256)
+// srcT, dstT and convertToDstT are not defined in this file; presumably they are
+// supplied as build options.
+// NOTE(review): the 128-entry local buffer together with the `lid - 128` store
+// and the `groupnum << 8` stride looks like it requires a work-group size of
+// exactly 256 - confirm against the host code.
+__kernel void arithm_op_sum(int cols,int invalid_cols,int offset,int elemnum,int groupnum,
+                                __global srcT *src, __global dstT *dst)
+{
+   unsigned int lid = get_local_id(0);
+   unsigned int gid = get_group_id(0);
+   unsigned int id = get_global_id(0);
+   // This initial value is never read: idx is recomputed inside the loop before use.
+   unsigned int idx = offset + id + (id / cols) * invalid_cols;
+
+   __local dstT localmem_sum[128];
+   dstT sum = (dstT)(0), temp;
+
+   // Each work-item accumulates the elements id, id + grainSize, id + 2*grainSize, ...
+   for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
+   {
+       idx = offset + id + (id / cols) * invalid_cols;
+       temp = convertToDstT(src[idx]);
+       FUNC(temp, sum);
+   }
+
+   // Work-items 128 and above publish their private sums first ...
+   if (lid > 127)
+       localmem_sum[lid - 128] = sum;
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   // ... then work-items 0..127 add their own sums on top of them.
+   if (lid < 128)
+       localmem_sum[lid] = sum + localmem_sum[lid];
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   // Halving reduction of the 128 partial sums down to localmem_sum[0].
+   // The barrier is outside the `if`, so every work-item reaches it.
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
+   {
+       if (lid < lsize)
+       {
+           int lid2 = lsize + lid;
+           localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+   }
+
+   // One result per work-group.
+   if (lid == 0)
+       dst[gid] = localmem_sum[0];
+}
--- a/modules/core/src/precomp.hpp
+++ b/modules/core/src/precomp.hpp
@ -67,6 +67,8 @@
 #define GET_OPTIMIZED(func) (func)
 #endif

+#include "opencl_kernels.hpp"
+
 namespace cv
 {

@ -205,13 +207,30 @@ enum { BLOCK_SIZE = 1024 };

 inline bool checkScalar(const Mat& sc, int atype, int sckind, int akind)
 {
-    if( sc.dims > 2 || (sc.cols != 1 && sc.rows != 1) || !sc.isContinuous() )
+    if( sc.dims > 2 || !sc.isContinuous() )
+        return false;
+    Size sz = sc.size();
+    if(sz.width != 1 && sz.height != 1)
        return false;
    int cn = CV_MAT_CN(atype);
    if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
        return false;
-    return sc.size() == Size(1, 1) || sc.size() == Size(1, cn) || sc.size() == Size(cn, 1) ||
-           (sc.size() == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
+    return sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) ||
+           (sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4);
+}
+
+// InputArray flavour of checkScalar: tells whether `sc` can act as a scalar
+// operand for an array of type `atype`.
+//   sckind, akind : _InputArray kinds of the scalar and of the array; a MATX
+//                   array only accepts a MATX scalar
+// Accepted shapes are 1x1, 1xcn, cnx1 (cn = channel count of atype) and a
+// 4-row, 1-column CV_64F array when cn <= 4.
+inline bool checkScalar(InputArray sc, int atype, int sckind, int akind)
+{
+    // Dimensionality and continuity are checked before the size is queried.
+    const bool flatAndContinuous = sc.dims() <= 2 && sc.isContinuous();
+    if( !flatAndContinuous )
+        return false;
+
+    const Size sz = sc.size();
+    if( !(sz.width == 1 || sz.height == 1) )
+        return false;
+
+    const int cn = CV_MAT_CN(atype);
+    if( akind == _InputArray::MATX && sckind != _InputArray::MATX )
+        return false;
+
+    if( sz == Size(1, 1) || sz == Size(1, cn) || sz == Size(cn, 1) )
+        return true;
+    return sz == Size(1, 4) && sc.type() == CV_64F && cn <= 4;
 }

 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize );
@ -227,7 +246,10 @@ struct TLSData
    static TLSData* get();
 };

-namespace ocl { MatAllocator* getOpenCLAllocator(); }
+namespace ocl
+{
+    MatAllocator* getOpenCLAllocator();
+}

 }

--- a/modules/core/src/umatrix.cpp
+++ b/modules/core/src/umatrix.cpp
@ -197,6 +197,7 @@ UMat Mat::getUMat(int accessFlags) const
    if(!u)
        return hdr;
    UMat::getStdAllocator()->allocate(u, accessFlags);
+    hdr.flags = flags;
    setSize(hdr, dims, size.p, step.p);
    finalizeHdr(hdr);
    hdr.u = u;
@ -548,7 +549,8 @@ Mat UMat::getMat(int accessFlags) const
    CV_Assert(u->data != 0);
    Mat hdr(dims, size.p, type(), u->data + offset, step.p);
    hdr.u = u;
-    hdr.datastart = hdr.data = u->data;
+    hdr.datastart = u->data;
+    hdr.data = hdr.datastart + offset;
    hdr.datalimit = hdr.dataend = u->data + u->size;
    CV_XADD(&hdr.u->refcount, 1);
    return hdr;
@ -617,7 +619,7 @@ void UMat::copyTo(OutputArray _dst) const
        void* dsthandle = dst.handle(ACCESS_WRITE);
        if( srchandle == dsthandle && dst.offset == offset )
            return;
-        ndoffset(dstofs);
+        dst.ndoffset(dstofs);
        CV_Assert(u->currAllocator == dst.u->currAllocator);
        u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false);
    }
@ -633,6 +635,50 @@ void UMat::convertTo(OutputArray, int, double, double) const
    CV_Error(Error::StsNotImplemented, "");
 }

+// Sets every element of this UMat (or only the elements selected by a non-empty
+// _mask) to the scalar _value and returns *this.
+// For at most 2 dimensions and 4 channels, with OpenCL enabled, the "set" /
+// "setMask" kernel is tried first; if the kernel cannot be created or run, the
+// data is mapped to a Mat and Mat::setTo does the work.
+UMat& UMat::setTo(InputArray _value, InputArray _mask)
+{
+    bool haveMask = !_mask.empty();
+    int t = type(), cn = CV_MAT_CN(t);
+    if( dims <= 2 && cn <= 4 && ocl::useOpenCL() )
+    {
+        Mat value = _value.getMat();
+        CV_Assert( checkScalar(value, type(), _value.kind(), _InputArray::UMAT) );
+        // Holds one element of type t (this branch is limited to cn <= 4).
+        double buf[4];
+        convertAndUnrollScalar(value, t, (uchar*)buf, 1);
+
+        char opts[1024];
+        sprintf(opts, "-D dstT=%s", ocl::memopTypeToStr(t));
+
+        ocl::Kernel setK(haveMask ? "setMask" : "set", ocl::core::copyset_oclsrc, opts);
+        if( !setK.empty() )
+        {
+            // The converted scalar is passed as CV_ELEM_SIZE(t) raw bytes.
+            ocl::KernelArg scalararg(0, 0, 0, buf, CV_ELEM_SIZE(t));
+            UMat mask;
+
+            if( haveMask )
+            {
+                mask = _mask.getUMat();
+                CV_Assert( mask.size() == size() && mask.type() == CV_8U );
+                ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
+                // Unselected elements keep their value, hence read-write access.
+                ocl::KernelArg dstarg = ocl::KernelArg::ReadWrite(*this);
+                setK.args(maskarg, dstarg, scalararg);
+            }
+            else
+            {
+                ocl::KernelArg dstarg = ocl::KernelArg::WriteOnly(*this);
+                setK.args(dstarg, scalararg);
+            }
+
+            // Explicit casts: int -> size_t inside a braced initializer is a
+            // narrowing conversion, which C++11 rejects.
+            size_t globalsize[] = { (size_t)cols, (size_t)rows };
+            if( setK.run(2, globalsize, 0, false) )
+                return *this;
+        }
+    }
+    // Fallback: operate on the mapped host data.
+    Mat m = getMat(haveMask ? ACCESS_RW : ACCESS_WRITE);
+    m.setTo(_value, _mask);
+    return *this;
+}
+
 UMat& UMat::operator = (const Scalar&)
 {
    CV_Error(Error::StsNotImplemented, "");
--- a/modules/core/test/test_umat.cpp
+++ b/modules/core/test/test_umat.cpp
@ -91,11 +91,11 @@ bool CV_UMatTest::TestUMat()
 {
    try
    {
-        Mat a(100, 100, CV_16S), b;
+        Mat a(100, 100, CV_16SC2), b, c;
        randu(a, Scalar::all(-100), Scalar::all(100));
-        Rect roi(1, 3, 10, 20);
-        Mat ra(a, roi), rb;
-        UMat ua, ura;
+        Rect roi(1, 3, 5, 4);
+        Mat ra(a, roi), rb, rc, rc0;
+        UMat ua, ura, ub, urb, uc, urc;
        a.copyTo(ua);
        ua.copyTo(b);
        CHECK_DIFF(a, b);
@ -112,6 +112,71 @@ bool CV_UMatTest::TestUMat()
        }
        ra.copyTo(rb);
        CHECK_DIFF(ra, rb);
+
+        b = a.clone();
+        ra = a(roi);
+        rb = b(roi);
+        randu(b, Scalar::all(-100), Scalar::all(100));
+        b.copyTo(ub);
+        urb = ub(roi);
+
+        /*std::cout << "==============================================\nbefore op (CPU):\n";
+        std::cout << "ra: " << ra << std::endl;
+        std::cout << "rb: " << rb << std::endl;*/
+
+        ra.copyTo(ura);
+        rb.copyTo(urb);
+        ra.release();
+        rb.release();
+        ura.copyTo(ra);
+        urb.copyTo(rb);
+
+        /*std::cout << "==============================================\nbefore op (GPU):\n";
+        std::cout << "ra: " << ra << std::endl;
+        std::cout << "rb: " << rb << std::endl;*/
+
+        cv::max(ra, rb, rc);
+        cv::max(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        /*std::cout << "==============================================\nafter op:\n";
+        std::cout << "rc: " << rc << std::endl;
+        std::cout << "rc0: " << rc0 << std::endl;*/
+
+        CHECK_DIFF(rc0, rc);
+
+        {
+        UMat tmp = rc0.getUMat(ACCESS_WRITE);
+        cv::max(ura, urb, tmp);
+        }
+        CHECK_DIFF(rc0, rc);
+
+        ura.copyTo(urc);
+        cv::max(urc, urb, urc);
+        urc.copyTo(rc0);
+        CHECK_DIFF(rc0, rc);
+
+        rc = ra ^ rb;
+        cv::bitwise_xor(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        /*std::cout << "==============================================\nafter op:\n";
+        std::cout << "ra: " << rc0 << std::endl;
+        std::cout << "rc: " << rc << std::endl;*/
+
+        CHECK_DIFF(rc0, rc);
+
+        rc = ra + rb;
+        cv::add(ura, urb, urc);
+        urc.copyTo(rc0);
+
+        CHECK_DIFF(rc0, rc);
+
+        cv::subtract(ra, Scalar::all(5), rc);
+        cv::subtract(ura, Scalar::all(5), urc);
+        urc.copyTo(rc0);
+
+        CHECK_DIFF(rc0, rc);
    }
    catch (const test_excep& e)
    {
--- a/modules/highgui/include/opencv2/highgui.hpp
+++ b/modules/highgui/include/opencv2/highgui.hpp
@ -511,9 +511,10 @@ public:
    CV_WRAP virtual void release();

    CV_WRAP virtual bool grab();
-    CV_WRAP virtual bool retrieve(CV_OUT Mat& image, int flag = 0);
+    CV_WRAP virtual bool retrieve(OutputArray image, int flag = 0);
    virtual VideoCapture& operator >> (CV_OUT Mat& image);
-    CV_WRAP virtual bool read(CV_OUT Mat& image);
+    virtual VideoCapture& operator >> (CV_OUT UMat& image);
+    CV_WRAP virtual bool read(OutputArray image);

    CV_WRAP virtual bool set(int propId, double value);
    CV_WRAP virtual double get(int propId);
--- a/modules/highgui/src/cap.cpp
+++ b/modules/highgui/src/cap.cpp
@ -515,7 +515,7 @@ bool VideoCapture::grab()
    return cvGrabFrame(cap) != 0;
 }

-bool VideoCapture::retrieve(Mat& image, int channel)
+bool VideoCapture::retrieve(OutputArray image, int channel)
 {
    IplImage* _img = cvRetrieveFrame(cap, channel);
    if( !_img )
@ -533,7 +533,7 @@ bool VideoCapture::retrieve(Mat& image, int channel)
    return true;
 }

-bool VideoCapture::read(Mat& image)
+bool VideoCapture::read(OutputArray image)
 {
    if(grab())
        retrieve(image);
@ -548,6 +548,12 @@ VideoCapture& VideoCapture::operator >> (Mat& image)
    return *this;
 }

+// Stream-style frame extraction into a UMat: forwards to read() and returns
+// the capture object so that calls can be chained. The result of read() is
+// not propagated.
+VideoCapture& VideoCapture::operator >> (UMat& image)
+{
+    this->read(image);
+    return *this;
+}
+
 bool VideoCapture::set(int propId, double value)
 {
    return cvSetCaptureProperty(cap, propId, value) != 0;
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@ -2687,6 +2687,124 @@ struct mRGBA2RGBA
    }
 };

+
+// OpenCL path of cvtColor for the conversion codes handled in the switch below.
+//   _src, _dst : source and destination arrays
+//   code       : COLOR_* conversion code
+//   dcn        : requested number of destination channels; values <= 0 mean
+//                "use the default for this conversion"
+// Returns true only if a kernel was created and ran successfully. For
+// unsupported depths or codes, or when the kernel fails, it returns false so
+// that the caller can fall back to the CPU implementation.
+static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
+{
+    // Start from "not handled": a code without a kernel (e.g. YCrCb2BGR below)
+    // must not be reported as done, or cv::cvtColor would return without
+    // ever writing the destination.
+    bool ok = false;
+    UMat src = _src.getUMat(), dst;
+    Size sz = src.size(), dstSz = sz;
+    int scn = src.channels(), depth = src.depth(), bidx;
+    // Explicit casts: int -> size_t inside a braced initializer is a narrowing
+    // conversion, which C++11 rejects.
+    size_t globalsize[] = { (size_t)src.cols, (size_t)src.rows };
+    ocl::Kernel k;
+
+    if(depth != CV_8U && depth != CV_16U && depth != CV_32F)
+        return false;
+
+    // Every case that creates a kernel also sets dcn; the destination type is
+    // derived from depth and dcn after the switch.
+    switch (code)
+    {
+    /* Not handled by the OpenCL path yet:
+     case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
+     case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
+     case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
+     case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
+     case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
+     case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
+     */
+    case COLOR_BGR2GRAY:
+    case COLOR_BGRA2GRAY:
+    case COLOR_RGB2GRAY:
+    case COLOR_RGBA2GRAY:
+    {
+        CV_Assert(scn == 3 || scn == 4);
+        bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
+        dcn = 1;
+        k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=%d -D dcn=1 -D bidx=%d", depth, scn, bidx));
+        break;
+    }
+    case COLOR_GRAY2BGR:
+    case COLOR_GRAY2BGRA:
+    {
+        CV_Assert(scn == 1);
+        dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
+        k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=1 -D dcn=%d", depth, dcn));
+        break;
+    }
+    case COLOR_BGR2YUV:
+    case COLOR_RGB2YUV:
+    {
+        CV_Assert(scn == 3 || scn == 4);
+        bidx = code == COLOR_RGB2YUV ? 0 : 2;
+        dcn = 3;
+        k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+        break;
+    }
+    case COLOR_YUV2BGR:
+    case COLOR_YUV2RGB:
+    {
+        // The public default for "unspecified" is 0, so treat 0 like a negative value.
+        if(dcn <= 0) dcn = 3;
+        CV_Assert(dcn == 3 || dcn == 4);
+        bidx = code == COLOR_YUV2RGB ? 0 : 2;
+        k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=3 -D dcn=%d -D bidx=%d", depth, dcn, bidx));
+        break;
+    }
+    case COLOR_YUV2RGB_NV12:
+    case COLOR_YUV2BGR_NV12:
+    case COLOR_YUV2RGBA_NV12:
+    case COLOR_YUV2BGRA_NV12:
+    {
+        CV_Assert( scn == 1 );
+        CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
+        dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ? 4 : 3;
+        bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ? 0 : 2;
+
+        // The source holds 3/2 times as many rows as the destination image.
+        dstSz = Size(sz.width, sz.height * 2 / 3);
+        // NOTE(review): dimension 0 gets height/2 and dimension 1 width/2, the
+        // opposite of the { cols, rows } order used by default - confirm that
+        // this matches the kernel's indexing.
+        globalsize[0] = dstSz.height/2;
+        globalsize[1] = dstSz.width/2;
+        k.create("YUV2RGBA_NV12", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=0 -D scn=1 -D dcn=%d -D bidx=%d", dcn, bidx));
+        break;
+    }
+    case COLOR_BGR2YCrCb:
+    case COLOR_RGB2YCrCb:
+    {
+        CV_Assert(scn == 3 || scn == 4);
+        bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
+        dcn = 3;
+        k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
+                 format("-D depth=%d -D scn=%d -D dcn=3 -D bidx=%d", depth, scn, bidx));
+        break;
+    }
+    case COLOR_YCrCb2BGR:
+    case COLOR_YCrCb2RGB:
+        // No kernel yet: k stays empty and the function returns false.
+        break;
+    /* Not handled by the OpenCL path yet:
+     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
+     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
+     case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
+     case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
+     case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
+     case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
+     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
+     case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
+     */
+    default:
+        ;
+    }
+
+    if( !k.empty() )
+    {
+        _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
+        dst = _dst.getUMat();
+        k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
+        ok = k.run(2, globalsize, 0, false);
+    }
+    return ok;
+}
+
 }//namespace cv

 //////////////////////////////////////////////////////////////////////////////////////////
@ -2695,9 +2813,15 @@ struct mRGBA2RGBA

 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
 {
+    bool use_opencl = ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT;
+    int stype = _src.type();
+    int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
+
+    if( use_opencl && ocl_cvtColor(_src, _dst, code, dcn) )
+        return;
+
    Mat src = _src.getMat(), dst;
    Size sz = src.size();
-    int scn = src.channels(), depth = src.depth(), bidx;

    CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );

--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@ -1901,8 +1901,43 @@ private:
 };
 #endif

+// OpenCL path of cv::resize.
+// Supported: up to 4 channels with INTER_NEAREST (any depth) or with
+// INTER_LINEAR for CV_8U / CV_32F data.
+//   _src, _dst    : source and destination; the destination size is taken from
+//                   _dst as it is on entry, it is not (re)allocated here
+//   fx, fy        : scale factors; their reciprocals are passed to the kernel
+//   interpolation : INTER_* method
+// Returns false if the combination is unsupported or the kernel cannot be
+// created or run.
+static bool ocl_resize( InputArray _src, OutputArray _dst,
+                        double fx, double fy, int interpolation)
+{
+    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
+    if( !(cn <= 4 &&
+           (interpolation == INTER_NEAREST ||
+           (interpolation == INTER_LINEAR && (depth == CV_8U || depth == CV_32F)))) )
+        return false;
+    UMat src = _src.getUMat(), dst = _dst.getUMat();
+    ocl::Kernel k;
+
+    if (interpolation == INTER_LINEAR)
+    {
+        // Intermediate type used by the kernel: 32-bit int for 8-bit input, float otherwise.
+        int wdepth = depth == CV_8U ? CV_32S : CV_32F;
+        int wtype = CV_MAKETYPE(wdepth, cn);
+        char buf[2][32];
+        // depth is an int, so it has to be formatted with %d (passing it to %s
+        // is undefined behaviour).
+        k.create("resizeLN", ocl::imgproc::resize_oclsrc,
+                 format("-D INTER_LINEAR -D depth=%d -D PIXTYPE=%s -D WORKTYPE=%s -D convertToWT=%s -D convertToDT=%s",
+                        depth, ocl::typeToStr(type), ocl::typeToStr(wtype),
+                        ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
+                        ocl::convertTypeStr(wdepth, depth, cn, buf[1])));
+    }
+    else if (interpolation == INTER_NEAREST)
+    {
+        k.create("resizeNN", ocl::imgproc::resize_oclsrc,
+                 format("-D INTER_NEAREST -D PIXTYPE=%s", ocl::memopTypeToStr(type) ));
+    }
+
+    if( k.empty() )
+        return false;
+    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
+           (float)(1./fx), (float)(1./fy));
+    // Explicit casts: int -> size_t inside a braced initializer is a narrowing
+    // conversion, which C++11 rejects.
+    size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
+    return k.run(2, globalsize, 0, false);
 }

+}

 //////////////////////////////////////////////////////////////////////////////////////////

@ -2013,25 +2048,28 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
        resizeArea_<double, double>, 0
    };

-    Mat src = _src.getMat();
-    Size ssize = src.size();
+    Size ssize = _src.size();

    CV_Assert( ssize.area() > 0 );
    CV_Assert( dsize.area() || (inv_scale_x > 0 && inv_scale_y > 0) );
    if( !dsize.area() )
    {
-        dsize = Size(saturate_cast<int>(src.cols*inv_scale_x),
-            saturate_cast<int>(src.rows*inv_scale_y));
+        dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
+                     saturate_cast<int>(ssize.height*inv_scale_y));
        CV_Assert( dsize.area() );
    }
    else
    {
-        inv_scale_x = (double)dsize.width/src.cols;
-        inv_scale_y = (double)dsize.height/src.rows;
+        inv_scale_x = (double)dsize.width/ssize.width;
+        inv_scale_y = (double)dsize.height/ssize.height;
    }
-    _dst.create(dsize, src.type());
-    Mat dst = _dst.getMat();
+    _dst.create(dsize, _src.type());

+    if( ocl::useOpenCL() && _dst.kind() == _InputArray::UMAT &&
+        ocl_resize(_src, _dst, inv_scale_x, inv_scale_y, interpolation) )
+        return;
+
+    Mat src = _src.getMat(), dst = _dst.getMat();

 #ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
--- a/modules/imgproc/src/opencl/bilateral.cl
+++ b/modules/imgproc/src/opencl/bilateral.cl
@ -0,0 +1,145 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Rock Li, Rock.li@amd.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+
+// Bilateral filter for single-channel 8-bit data; one output pixel per work-item.
+// For each of the maxk neighbours (addressed through space_ofs[k]) the weight
+// is space_weight[k] * color_weight[|neighbour - centre|]; the result is the
+// weighted mean, stored with +0.5 and round-toward-zero conversion.
+// The source index is shifted by `radius` in both directions - presumably the
+// source carries a border of that width; confirm against the host code.
+__kernel void bilateral_C1_D0(__global uchar *dst,
+        __global const uchar *src,
+        const int dst_rows,
+        const int dst_cols,
+        const int maxk,
+        const int radius,
+        const int dst_step,
+        const int dst_offset,
+        const int src_step,
+        const int src_rows,
+        const int src_cols,
+        __constant float *color_weight,
+        __constant float *space_weight,
+        __constant int *space_ofs)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Work-items outside the destination have nothing to do.
+    if (y >= dst_rows || x >= dst_cols)
+        return;
+
+    const int src_index = mad24(y + radius, src_step, x + radius);
+    const int dst_index = mad24(y, dst_step, x + dst_offset);
+    const int centre = (int)src[src_index];
+
+    float sum = 0.f, wsum = 0.f;
+    for (int k = 0; k < maxk; k++)
+    {
+        const int val = (int)src[src_index + space_ofs[k]];
+        const float w = space_weight[k] * color_weight[abs(val - centre)];
+        sum += (float)(val) * w;
+        wsum += w;
+    }
+    dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
+}
+
+// Vectorised variant of bilateral_C1_D0: each work-item filters four
+// horizontally adjacent single-channel 8-bit pixels, starting at
+// x = 4 * get_global_id(0), and stores them with one uchar4 write.
+// NOTE(review): only the first of the four pixels is checked against dst_cols,
+// so the host presumably guarantees room for the full uchar4 store - confirm.
+__kernel void bilateral2_C1_D0(__global uchar *dst,
+        __global const uchar *src,
+        const int dst_rows,
+        const int dst_cols,
+        const int maxk,
+        const int radius,
+        const int dst_step,
+        const int dst_offset,
+        const int src_step,
+        const int src_rows,
+        const int src_cols,
+        __constant float *color_weight,
+        __constant float *space_weight,
+        __constant int *space_ofs)
+{
+    const int x = get_global_id(0) << 2;
+    const int y = get_global_id(1);
+
+    // Work-items outside the destination have nothing to do.
+    if (y >= dst_rows || x >= dst_cols)
+        return;
+
+    const int src_index = mad24(y + radius, src_step, x + radius);
+    const int dst_index = mad24(y, dst_step, x + dst_offset);
+    const int4 centre = convert_int4(vload4(0,src + src_index));
+
+    float4 sum = (float4)(0.f), wsum = (float4)(0.f);
+    for (int k = 0; k < maxk; k++)
+    {
+        const int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k]));
+        // One colour weight per lane, all sharing the same spatial weight.
+        const float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - centre.x)], color_weight[abs(val.y - centre.y)],
+            color_weight[abs(val.z - centre.z)], color_weight[abs(val.w - centre.w)]);
+        sum += convert_float4(val) * w;
+        wsum += w;
+    }
+    *(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f);
+}
+
+// Bilateral filter for 4-channel 8-bit data; one uchar4 pixel per work-item.
+// The colour weight is looked up with the sum of absolute differences of the
+// first three channels (.x, .y, .z); the fourth channel does not influence the
+// weight but is averaged like the others.
+// src and dst are indexed in uchar4 units.
+__kernel void bilateral_C4_D0(__global uchar4 *dst,
+        __global const uchar4 *src,
+        const int dst_rows,
+        const int dst_cols,
+        const int maxk,
+        const int radius,
+        const int dst_step,
+        const int dst_offset,
+        const int src_step,
+        const int src_rows,
+        const int src_cols,
+        __constant float *color_weight,
+        __constant float *space_weight,
+        __constant int *space_ofs)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Work-items outside the destination have nothing to do.
+    if (y >= dst_rows || x >= dst_cols)
+        return;
+
+    const int src_index = mad24(y + radius, src_step, x + radius);
+    const int dst_index = mad24(y, dst_step, x + dst_offset);
+    const int4 centre = convert_int4(src[src_index]);
+
+    float4 sum = (float4)0.f;
+    float wsum = 0.f;
+    for (int k = 0; k < maxk; k++)
+    {
+        const int4 val = convert_int4(src[src_index + space_ofs[k]]);
+        const float w = space_weight[k] * color_weight[abs(val.x - centre.x) + abs(val.y - centre.y) + abs(val.z - centre.z)];
+        sum += convert_float4(val) * (float4)w;
+        wsum += w;
+    }
+
+    // Normalise with the reciprocal of the accumulated weight.
+    const float inv_wsum = 1.f / wsum;
+    dst[dst_index] = convert_uchar4_rtz(sum * (float4)inv_wsum + (float4)0.5f);
+}
--- a/modules/imgproc/src/opencl/boxfilter.cl
+++ b/modules/imgproc/src/opencl/boxfilter.cl
@ -0,0 +1,478 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Ying, zhangying913@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Index-remapping helpers, one set per border mode (selected by a BORDER_* define):
+//   ADDR_L(i, l_edge, r_edge)  remaps a column index i that lies left of l_edge, otherwise yields i
+//   ADDR_R(i, r_edge, addr)    remaps a column index i that lies at/after r_edge, otherwise yields addr
+//                              (addr is the value previously produced by ADDR_L)
+//   ADDR_H(i, t_edge, b_edge)  same as ADDR_L, for row indices above t_edge
+//   ADDR_B(i, b_edge, addr)    same as ADDR_R, for row indices at/after b_edge
+// The reflect variants mirror about index 0 and ignore the value of l_edge/t_edge;
+// every use in this file passes 0 for that edge.
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT_101
+//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+//blur function does not support BORDER_WRAP
+#ifdef BORDER_WRAP
+//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#endif
+
+// Number of work-items along x that the box-filter kernels size their local scratch buffers for.
+#define THREADS 256
+// Yields elem1 when l_edge <= i < r_edge and elem2 otherwise.
+// The expansion is fully parenthesized so the macro can be used inside a larger expression.
+#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
+
+// Horizontal pass of the 8-bit single-channel box filter for one output row.
+//
+// dst                    points at the first of the 4 adjacent output pixels this work-item writes
+// temp                   local buffer of per-column vertical sums, viewed as individual uints
+// dst_rows, dst_cols     size of the destination region
+// dst_startX, dst_x_off  used to derive the destination-relative column of the first pixel
+// alpha                  divisor applied to the window sum (converted to an unsigned integer first)
+//
+// anX and ksX are not defined in this file and must be supplied when the program is built.
+// The row test uses the work-group's first row (group id * 2) regardless of which row `dst` points at.
+inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
+                             int dst_rows, int dst_cols,
+                             int dst_startX, int dst_x_off,
+                             float alpha)
+{
+    const size_t lid = get_local_id(0);
+
+    // Only work-items whose whole horizontal window is present in `temp` produce output.
+    const bool has_full_window = lid >= anX && lid < (THREADS-ksX+anX+1);
+    if(!has_full_window)
+        return;
+
+    const int posX = dst_startX - dst_x_off + (lid-anX)*4;
+    const int posY = (get_group_id(1) << 1);
+
+    // Slide a 4-wide vector across the window one element at a time; lane j accumulates
+    // the window sum of output pixel j.
+    uint4 acc = 0;
+    int shift = -anX;
+    while(shift <= anX)
+    {
+        acc += vload4(lid, temp+shift);
+        shift++;
+    }
+
+    if(posY >= dst_rows || posX >= dst_cols)
+        return;
+
+    acc /= (uint4) alpha;
+
+    // Each of the four pixels is stored only if its column lies inside the destination.
+    if(posX >= 0 && posX < dst_cols)
+        dst[0] = acc.x;
+    if(posX+1 >= 0 && posX+1 < dst_cols)
+        dst[1] = acc.y;
+    if(posX+2 >= 0 && posX+2 < dst_cols)
+        dst[2] = acc.z;
+    if(posX+3 >= 0 && posX+3 < dst_cols)
+        dst[3] = acc.w;
+}
+
+
+// Horizontal pass of the 8-bit 4-channel box filter for one output pixel.
+//
+// dst                    points at the output pixel this work-item writes
+// temp                   local buffer of per-column vertical sums (one uint4 per column)
+// dst_rows, dst_cols     size of the destination region
+// dst_startX, dst_x_off  used to derive the destination-relative column
+// alpha                  divisor applied to the window sum
+//
+// anX and ksX are not defined in this file and must be supplied when the program is built.
+// The row test uses the work-group's first row (group id * 2) regardless of which row `dst` points at.
+inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
+                             int dst_rows, int dst_cols,
+                             int dst_startX, int dst_x_off,
+                             float alpha)
+{
+    const size_t lid = get_local_id(0);
+
+    // The last ksX-1 work-items only contribute input columns; they write nothing.
+    if(lid >= (THREADS-ksX+1))
+        return;
+
+    // Sum the 2*anX+1 column sums centred on temp[lid + anX].
+    uint4 window_sum = 0;
+    int i = -anX;
+    while(i <= anX)
+    {
+        window_sum += temp[lid + anX + i];
+        i++;
+    }
+
+    const int posX = dst_startX - dst_x_off + lid;
+    const int posY = (get_group_id(1) << 1);
+
+    const bool col_inside = posX >= 0 && posX < dst_cols;
+    const bool row_inside = posY >= 0 && posY < dst_rows;
+    if(col_inside && row_inside)
+        *dst = convert_uchar4(convert_float4(window_sum)/alpha);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter for 8-bit single-channel images.
+//
+// Each work-group produces two output rows (2*gY and 2*gY+1). Each work-item loads four adjacent
+// source pixels for ksY+1 consecutive rows, builds the vertical sums for both output rows, and
+// publishes them in local memory; update_dst_C1_D0 then performs the horizontal sum and the store.
+//
+// src, src_offset, src_step               source buffer, byte offset of the ROI and row stride
+// src_whole_rows, src_whole_cols          size of the whole source image (used for border handling)
+// dst, dst_offset, dst_step               destination buffer, byte offset of the ROI and row stride
+// dst_rows, dst_cols                      size of the destination region
+// alpha                                   divisor applied to each window sum
+//
+// anX, anY, ksX and ksY are not defined in this file and must be supplied when the program is built,
+// together with one BORDER_* define.
+__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+    int src_x_off = src_offset % src_step;
+    int src_y_off = src_offset / src_step;
+    int dst_x_off = dst_offset % dst_step;
+    int dst_y_off = dst_offset / dst_step;
+
+    // Shift the tile left so that destination columns start on a multiple of 4.
+    int head_off = dst_x_off%4;
+    int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
+    int startY = (gY << 1) - anY + src_y_off;
+    int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+
+    uint4 data[ksY+1];
+    __local uint4 temp[2][THREADS];
+
+#ifdef BORDER_CONSTANT
+
+    // Pixels outside the source image contribute 0.
+    for(int i=0; i < ksY+1; i++)
+    {
+        if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
+        {
+            data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
+            data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
+            data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
+            data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
+        }
+        else
+        {
+            // Partially outside: test and load each of the four pixels separately.
+            data[i]=0;
+            int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
+            if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
+            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
+            if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
+            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
+            if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
+            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
+            if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
+        }
+    }
+
+#else
+    int not_all_in_range;
+    for(int i=0; i < ksY+1; i++)
+    {
+        not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
+                           | (startY+i<0) | (startY+i>src_whole_rows-1);
+        if(not_all_in_range)
+        {
+            // Remap every out-of-image coordinate according to the selected border mode.
+            int selected_row;
+            int4 selected_col;
+            selected_row = ADDR_H(startY+i, 0, src_whole_rows);
+            selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
+
+            selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
+            selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);
+
+            selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
+            selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);
+
+            selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
+            selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);
+
+            selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
+            selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);
+
+            data[i].x = *(src + selected_row * src_step + selected_col.x);
+            data[i].y = *(src + selected_row * src_step + selected_col.y);
+            data[i].z = *(src + selected_row * src_step + selected_col.z);
+            data[i].w = *(src + selected_row * src_step + selected_col.w);
+        }
+        else
+        {
+            data[i] =  convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
+        }
+    }
+#endif
+    // Rows 1..ksY-1 are shared by both output rows; row 0 belongs only to the first,
+    // row ksY only to the second.
+    uint4 tmp_sum = 0;
+    for(int i=1; i < ksY; i++)
+    {
+        tmp_sum += (data[i]);
+    }
+
+    int index = dst_startY * dst_step + dst_startX + (col-anX)*4;
+
+    temp[0][col] = tmp_sum + (data[0]);
+    temp[1][col] = tmp_sum + (data[ksY]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
+                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+    // update_dst_C1_D0 only tests the first row of the pair (2*gY) against dst_rows, so the
+    // second row must be checked here; otherwise an odd dst_rows would cause a write one row
+    // below the destination region.
+    if((gY << 1) + 1 < dst_rows)
+    {
+        update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
+                         dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+    }
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter for 8-bit 4-channel images.
+//
+// Each work-group produces two output rows (2*gY and 2*gY+1). Each work-item loads one source
+// pixel for ksY+1 consecutive rows, builds the vertical sums for both output rows, and publishes
+// them in local memory; update_dst_C4_D0 then performs the horizontal sum and the store.
+//
+// src, src_offset, src_step               source buffer, byte offset of the ROI and row stride in bytes
+//                                         (shifted right by 2 to index uchar4 elements)
+// src_whole_rows, src_whole_cols          size of the whole source image (used for border handling)
+// dst, dst_offset, dst_step               destination buffer, byte offset of the ROI and row stride in bytes
+// dst_rows, dst_cols                      size of the destination region
+// alpha                                   divisor applied to each window sum
+//
+// anX, anY, ksX and ksY are not defined in this file and must be supplied when the program is built,
+// together with one BORDER_* define.
+__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+
+    int src_x_off = (src_offset % src_step) >> 2;
+    int src_y_off = src_offset / src_step;
+    int dst_x_off = (dst_offset % dst_step) >> 2;
+    int dst_y_off = dst_offset / dst_step;
+
+    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
+    int startY = (gY << 1) - anY + src_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+
+    uint4 data[ksY+1];
+    __local uint4 temp[2][THREADS];
+
+#ifdef BORDER_CONSTANT
+    // Pixels outside the source image contribute 0; src is only dereferenced when con is true.
+    bool con;
+    for(int i=0; i < ksY+1; i++)
+    {
+        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
+        int cur_col = clamp(startX + col, 0, src_whole_cols);
+
+        data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
+        data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
+        data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
+        data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
+    }
+#else
+    // Remap out-of-image coordinates according to the selected border mode.
+    for(int i=0; i < ksY+1; i++)
+    {
+        int selected_row;
+        int selected_col;
+        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
+        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
+
+        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
+        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
+
+
+        data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
+    }
+
+#endif
+    // Rows 1..ksY-1 are shared by both output rows; row 0 belongs only to the first,
+    // row ksY only to the second.
+    uint4 tmp_sum = 0;
+    for(int i=1; i < ksY; i++)
+    {
+        tmp_sum += (data[i]);
+    }
+
+    int index = dst_startY * (dst_step>>2)+ dst_startX + col;
+
+    temp[0][col] = tmp_sum + (data[0]);
+    temp[1][col] = tmp_sum + (data[ksY]);
+    barrier(CLK_LOCAL_MEM_FENCE);
+    update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
+                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+    // update_dst_C4_D0 only tests the first row of the pair (2*gY) against dst_rows, so the
+    // second row must be checked here; otherwise an odd dst_rows would cause a write one row
+    // below the destination region.
+    if((gY << 1) + 1 < dst_rows)
+    {
+        update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
+                         dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
+    }
+
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter for 32-bit float single-channel images.
+//
+// Each work-group produces two output rows (2*gY and 2*gY+1) and THREADS-ksX+1 output columns.
+// Each work-item loads one source value for ksY+1 consecutive rows, publishes the two vertical
+// sums in local memory, and after the barrier sums 2*anX+1 neighbouring columns horizontally.
+//
+// src_step and dst_step are shifted right by 2 before being used to index float elements.
+// anX, anY, ksX and ksY are not defined in this file and must be supplied when the program is built.
+__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+
+    int src_x_off = (src_offset % src_step) >> 2;
+    int src_y_off = src_offset / src_step;
+    int dst_x_off = (dst_offset % dst_step) >> 2;
+    int dst_y_off = dst_offset / dst_step;
+
+    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
+    int startY = (gY << 1) - anY + src_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+    float data[ksY+1];
+    __local float temp[2][THREADS];
+#ifdef BORDER_CONSTANT
+    // Samples outside the source image contribute 0.
+    bool con;
+    float ss;
+    for(int i=0; i < ksY+1; i++)
+    {
+        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
+
+        // ss is read only when the (clamped) coordinate is inside the image; con then selects again.
+        int cur_col = clamp(startX + col, 0, src_whole_cols);
+        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;
+
+        data[i] = con ? ss : 0.f;
+    }
+#else
+    // Remap out-of-image coordinates according to the selected border mode.
+    for(int i=0; i < ksY+1; i++)
+    {
+        int selected_row;
+        int selected_col;
+        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
+        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
+
+        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
+        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
+
+        data[i] = src[selected_row * (src_step>>2) + selected_col];
+    }
+
+#endif
+    // sum0 covers rows 1..ksY-1, which both output rows share; sum1 adds row 0, sum2 adds row ksY.
+    float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
+    for(int i=1; i < ksY; i++)
+    {
+        sum0 += (data[i]);
+    }
+    sum1 = sum0 + (data[0]);
+    sum2 = sum0 + (data[ksY]);
+    temp[0][col] = sum1;
+    temp[1][col] = sum2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // The last ksX-1 work-items only supplied input columns; they produce no output.
+    if(col < (THREADS-(ksX-1)))
+    {
+        col += anX;
+        int posX = dst_startX - dst_x_off + col - anX;
+        int posY = (gY << 1);
+
+        float tmp_sum[2]= {0.0, 0.0};
+        for(int k=0; k<2; k++)
+            for(int i=-anX; i<=anX; i++)
+            {
+                tmp_sum[k] += temp[k][col+i];
+            }
+        // Each of the two rows is bounds-checked separately before it is stored.
+        for(int i=0; i<2; i++)
+        {
+            if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
+                dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
+        }
+
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Box filter for 32-bit float 4-channel images.
+//
+// Each work-group produces two output rows (2*gY and 2*gY+1) and THREADS-ksX+1 output columns.
+// Each work-item loads one float4 pixel for ksY+1 consecutive rows, publishes the two vertical
+// sums in local memory, and after the barrier sums 2*anX+1 neighbouring columns horizontally.
+//
+// src_step and dst_step are shifted right by 4 before being used to index float4 elements.
+// anX, anY, ksX and ksY are not defined in this file and must be supplied when the program is built.
+__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
+                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step
+                             )
+{
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+
+    int src_x_off = (src_offset % src_step) >> 4;
+    int src_y_off = src_offset / src_step;
+    int dst_x_off = (dst_offset % dst_step) >> 4;
+    int dst_y_off = dst_offset / dst_step;
+
+    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
+    int startY = (gY << 1) - anY + src_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+    float4 data[ksY+1];
+    __local float4 temp[2][THREADS];
+#ifdef BORDER_CONSTANT
+    // Samples outside the source image contribute 0.
+    bool con;
+    float4 ss;
+    for(int i=0; i < ksY+1; i++)
+    {
+        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
+
+        // ss is read only when the (clamped) coordinate is inside the image; con then selects again.
+        int cur_col = clamp(startX + col, 0, src_whole_cols);
+        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;
+
+        data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
+    }
+#else
+    // Remap out-of-image coordinates according to the selected border mode.
+    for(int i=0; i < ksY+1; i++)
+    {
+        int selected_row;
+        int selected_col;
+        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
+        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);
+
+        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
+        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);
+
+        data[i] = src[selected_row * (src_step>>4) + selected_col];
+    }
+
+#endif
+    // sum0 covers rows 1..ksY-1, which both output rows share; sum1 adds row 0, sum2 adds row ksY.
+    float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
+    for(int i=1; i < ksY; i++)
+    {
+        sum0 += (data[i]);
+    }
+    sum1 = sum0 + (data[0]);
+    sum2 = sum0 + (data[ksY]);
+    temp[0][col] = sum1;
+    temp[1][col] = sum2;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // The last ksX-1 work-items only supplied input columns; they produce no output.
+    if(col < (THREADS-(ksX-1)))
+    {
+        col += anX;
+        int posX = dst_startX - dst_x_off + col - anX;
+        int posY = (gY << 1);
+
+        float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
+        for(int k=0; k<2; k++)
+            for(int i=-anX; i<=anX; i++)
+            {
+                tmp_sum[k] += temp[k][col+i];
+            }
+        // Each of the two rows is bounds-checked separately before it is stored.
+        for(int i=0; i<2; i++)
+        {
+            if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
+                dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
+        }
+
+    }
+}
--- a/modules/imgproc/src/opencl/canny.cl
+++ b/modules/imgproc/src/opencl/canny.cl
@ -0,0 +1,636 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+// Gradient magnitude of the derivative pair (x, y):
+// the Euclidean norm when L2GRAD is defined, otherwise |x| + |y|.
+inline float calc(int x, int y)
+{
+#ifdef L2GRAD
+    const int squared = x * x + y * y;
+    return sqrt((float)squared);
+#else
+    const float abs_x = (float)abs(x);
+    return abs_x + abs(y);
+#endif
+}
+
+// Smoothing perpendicular to the derivative direction with a triangle filter
+// only support 3x3 Sobel kernel
+// h (-1) =  1, h (0) =  2, h (1) =  1
+// h'(-1) = -1, h'(0) =  0, h'(1) =  1
+// thus sobel 2D operator can be calculated as:
+// h'(x, y) = h'(x)h(y) for x direction
+//
+// src		input 8bit single channel image data
+// dx_buf	output dx buffer
+// dy_buf	output dy buffer
+// Horizontal (row) pass of the separable 3x3 Sobel operator.
+//
+// For every pixel the kernel writes
+//   dx_buf = -left + right              (derivative taps along the row)
+//   dy_buf =  left + 2*centre + right   (smoothing taps along the row)
+// Pixels outside the image are replaced by the nearest pixel inside it.
+//
+// src, src_step, src_offset   8-bit single-channel input; step and offset are used as element counts
+// dx_buf, dy_buf              int outputs; their step/offset arguments are divided by the element
+//                             size before use
+// rows, cols                  image size
+//
+// The work-group size is fixed at 16x16; the local tile holds 16 rows of 16 pixels plus one halo
+// column on each side.
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+calcSobelRowPass
+(
+    __global const uchar * src,
+    __global int * dx_buf,
+    __global int * dy_buf,
+    int rows,
+    int cols,
+    int src_step,
+    int src_offset,
+    int dx_buf_step,
+    int dx_buf_offset,
+    int dy_buf_step,
+    int dy_buf_offset
+)
+{
+    dx_buf_step   /= sizeof(*dx_buf);
+    dx_buf_offset /= sizeof(*dx_buf);
+    dy_buf_step   /= sizeof(*dy_buf);
+    dy_buf_offset /= sizeof(*dy_buf);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    __local int smem[16][18];
+
+    // Start of the (row-clamped) source row this work-item reads from.
+    const int src_row = min(gidy, rows - 1) * src_step + src_offset;
+
+    // Clamp the column as well as the row: when cols is not a multiple of 16 the last valid
+    // pixel takes its right neighbour from the slot filled by work-item gidx == cols, which
+    // must therefore hold the replicated border pixel rather than whatever follows the row.
+    smem[lidy][lidx + 1] =
+        src[min(gidx, cols - 1) + src_row];
+    if(lidx == 0)
+    {
+        // The first work-item of each row also fills the two halo columns.
+        smem[lidy][0]  =
+            src[max(gidx - 1,  0)        + src_row];
+        smem[lidy][17] =
+            src[min(gidx + 16, cols - 1) + src_row];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidy < rows && gidx < cols)
+    {
+        dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] =
+            -smem[lidy][lidx] + smem[lidy][lidx + 2];
+        dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] =
+            smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2];
+    }
+}
+
+// calculate the magnitude of the filter pass combining both x and y directions
+// This is the buffered version(3x3 sobel)
+//
+// dx_buf		dx buffer, calculated from calcSobelRowPass
+// dy_buf		dy buffer, calculated from calcSobelRowPass
+// dx			derivative in x direction output
+// dy			derivative in y direction output
+// mag			gradient magnitude output
+// Vertical (column) pass of the separable 3x3 Sobel operator plus gradient magnitude.
+// Reads the row-pass results, combines three vertically adjacent values per pixel and writes
+// dx, dy and calc(dx, dy). All step/offset arguments are divided by the element size before use.
+// The work-group size is fixed at 16x16; each local tile holds 16 columns of 16 rows plus one
+// halo row above and below.
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+calcMagnitude_buf
+(
+    __global const int * dx_buf,
+    __global const int * dy_buf,
+    __global int * dx,
+    __global int * dy,
+    __global float * mag,
+    int rows,
+    int cols,
+    int dx_buf_step,
+    int dx_buf_offset,
+    int dy_buf_step,
+    int dy_buf_offset,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset
+)
+{
+    dx_buf_step    /= sizeof(*dx_buf);
+    dx_buf_offset  /= sizeof(*dx_buf);
+    dy_buf_step    /= sizeof(*dy_buf);
+    dy_buf_offset  /= sizeof(*dy_buf);
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    __local int sdx[18][16];
+    __local int sdy[18][16];
+
+    // Centre rows of the tile; the row index is clamped to the last image row.
+    // NOTE(review): the column index gidx is not clamped, so work-items with gidx >= cols read
+    // past the end of the row (their results are never stored) - confirm the buffers are padded.
+    sdx[lidy + 1][lidx] =
+        dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset];
+    sdy[lidy + 1][lidx] =
+        dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset];
+    if(lidy == 0)
+    {
+        // The first row of work-items also fills the halo rows, clamped to the image.
+        sdx[0][lidx]  =
+            dx_buf[gidx + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset];
+        sdx[17][lidx] =
+            dx_buf[gidx + min(gidy + 16, rows - 1)  * dx_buf_step + dx_buf_offset];
+
+        sdy[0][lidx]  =
+            dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset];
+        sdy[17][lidx] =
+            dy_buf[gidx + min(gidy + 16, rows - 1)  * dy_buf_step + dy_buf_offset];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidx < cols && gidy < rows)
+    {
+        // x: smoothing taps (1, 2, 1) down the column; y: derivative taps (-1, 0, 1) down the column.
+        int x =  sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx];
+        int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx];
+
+        dx[gidx + gidy * dx_step + dx_offset] = x;
+        dy[gidx + gidy * dy_step + dy_offset] = y;
+
+        // The magnitude is stored one row and one column further in than the pixel position.
+        mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y);
+    }
+}
+
+// calculate the magnitude of the filter pass combining both x and y directions
+// This is the non-buffered version(non-3x3 sobel)
+//
+// dx			derivative in x direction (input)
+// dy			derivative in y direction (input)
+// mag			gradient magnitude computed from dx and dy (output)
+// Computes the gradient magnitude calc(dx, dy) for every pixel from already available
+// derivative images. The result is stored one row and one column further in than the pixel
+// position. All step/offset arguments are divided by the element size before use.
+__kernel
+void calcMagnitude
+(
+    __global const int * dx,
+    __global const int * dy,
+    __global float * mag,
+    int rows,
+    int cols,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset
+)
+{
+    const int gidx = get_global_id(0);
+    const int gidy = get_global_id(1);
+
+    // Work-items outside the image have nothing to compute.
+    if(gidx >= cols || gidy >= rows)
+        return;
+
+    // Convert steps and offsets to element units.
+    const int dx_stride  = dx_step    / sizeof(*dx);
+    const int dx_first   = dx_offset  / sizeof(*dx);
+    const int dy_stride  = dy_step    / sizeof(*dy);
+    const int dy_first   = dy_offset  / sizeof(*dy);
+    const int mag_stride = mag_step   / sizeof(*mag);
+    const int mag_first  = mag_offset / sizeof(*mag);
+
+    const int gx = dx[gidx + gidy * dx_stride + dx_first];
+    const int gy = dy[gidx + gidy * dy_stride + dy_first];
+
+    mag[(gidx + 1) + (gidy + 1) * mag_stride + mag_first] = calc(gx, gy);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Fixed-point constants for the gradient-direction test in calcMap.
+// CANNY_SHIFT is the number of fractional bits; TG22 is tan(22.5 degrees)
+// (0.4142135623730950488016887242097) scaled by 2^CANNY_SHIFT and rounded to the nearest integer.
+#define CANNY_SHIFT 15
+#define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)
+
+//First pass of edge detection and non-maximum suppression
+// The edge type of each pixel is set to one of:
+// 0 - below low thres, not an edge
+// 1 - maybe an edge
+// 2 - is an edge, either magnitude is greater than high thres, or
+//     Given estimates of the image gradients, a search is then carried out
+//     to determine if the gradient magnitude assumes a local maximum in the gradient direction.
+//     if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions,
+//     if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions,
+//     if the rounded gradient angle is 135 degrees (i.e. the edge is in the north east-south west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north west and south east directions,
+//     if the rounded gradient angle is 45 degrees (i.e. the edge is in the north west-south east direction)the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north east and south west directions.
+//
+// dx, dy		derivatives in x and y direction
+// mag			magnitudes calculated from calcMagnitude function
+// map			output containing raw edge types
+// Non-maximum suppression and double thresholding.
+// For each pixel the magnitude is compared with its two neighbours along the quantised gradient
+// direction and the result (0, 1 or 2) is written to `map`, one row and one column further in
+// than the pixel position. All step/offset arguments are divided by the element size before use.
+// The work-group size is fixed at 16x16; the local tile is 18x18 (16x16 plus a 1-pixel halo).
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+calcMap
+(
+    __global const int * dx,
+    __global const int * dy,
+    __global const float * mag,
+    __global int * map,
+    int rows,
+    int cols,
+    float low_thresh,
+    float high_thresh,
+    int dx_step,
+    int dx_offset,
+    int dy_step,
+    int dy_offset,
+    int mag_step,
+    int mag_offset,
+    int map_step,
+    int map_offset
+)
+{
+    dx_step    /= sizeof(*dx);
+    dx_offset  /= sizeof(*dx);
+    dy_step    /= sizeof(*dy);
+    dy_offset  /= sizeof(*dy);
+    mag_step   /= sizeof(*mag);
+    mag_offset /= sizeof(*mag);
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    // NOTE(review): dx_offset and dy_offset are converted above but never added to an index
+    // below - confirm they are always zero.
+    mag += mag_offset;
+    map += map_offset;
+
+    __local float smem[18][18];
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    // Global coordinates of the work-group's first work-item (global id rounded down to a multiple of 16).
+    int grp_idx = get_global_id(0) & 0xFFFFF0;
+    int grp_idy = get_global_id(1) & 0xFFFFF0;
+
+    // The 256 work-items are re-indexed as rows of 18 to load the 18x18 tile cooperatively:
+    // the first load covers tile rows 0..13, the second covers tile rows 14..17.
+    int tid = lidx + lidy * 16;
+    int lx = tid % 18;
+    int ly = tid / 18;
+    if(ly < 14)
+    {
+        smem[ly][lx] =
+            mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step];
+    }
+    // NOTE(review): tile entries for which this condition fails are left unwritten, and the
+    // row index is clamped to rows - 1 although magnitudes are stored at row gidy + 1 by the
+    // magnitude kernels - confirm the bottom image rows read the intended values.
+    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+    {
+        smem[ly + 14][lx] =
+            mag[grp_idx + lx + min(grp_idy + ly + 14, rows -1) * mag_step];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidy < rows && gidx < cols)
+    {
+        int x = dx[gidx + gidy * dx_step];
+        int y = dy[gidx + gidy * dy_step];
+        // s is -1 when dx and dy have opposite signs, +1 otherwise; it selects the diagonal.
+        const int s = (x ^ y) < 0 ? -1 : 1;
+        const float m = smem[lidy + 1][lidx + 1];
+        x = abs(x);
+        y = abs(y);
+
+        // 0 - the pixel can not belong to an edge
+        // 1 - the pixel might belong to an edge
+        // 2 - the pixel does belong to an edge
+        int edge_type = 0;
+        if(m > low_thresh)
+        {
+            // Fixed-point comparison of |dy| against |dx|*tan(22.5 deg) and |dx|*tan(67.5 deg);
+            // tan(67.5 deg) equals tan(22.5 deg) + 2, hence the shift by CANNY_SHIFT + 1.
+            const int tg22x = x * TG22;
+            const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
+            y <<= CANNY_SHIFT;
+            if(y < tg22x)
+            {
+                // Mostly horizontal gradient: compare with the left and right neighbours.
+                if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else if (y > tg67x)
+            {
+                // Mostly vertical gradient: compare with the upper and lower neighbours.
+                if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+            else
+            {
+                // Diagonal gradient: compare with the two neighbours on the diagonal chosen by s.
+                if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s])
+                {
+                    edge_type = 1 + (int)(m > high_thresh);
+                }
+            }
+        }
+        map[gidx + 1 + (gidy + 1) * map_step] = edge_type;
+    }
+}
+
+#undef CANNY_SHIFT
+#undef TG22
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Local hysteresis for pixels whose edge type is 1 (candidates).
+//
+// A candidate pixel that has a neighbour of type 2 in its 3x3 area is considered part of an edge and is promoted to
+// type 2. Each work-item repeats this check 16 times on the work-group's local tile to connect local edges.
+// Afterwards, every pixel of type 2 that still has a type-1 neighbour in the tile increments counter by 1 and stores
+// its location (gidx + 1, gidy + 1) in st. These potential edge points are processed further in the next kernel.
+//
+// map		raw edge type results calculated from calcMap; updated in place by this kernel
+// st		the potential edge points found in this kernel call
+// counter	the number of potential edge points (advanced with atomic_inc)
+__kernel
+void
+__attribute__((reqd_work_group_size(16,16,1)))
+edgesHysteresisLocal
+(
+    __global int * map,
+    __global ushort2 * st,
+    __global unsigned int * counter,
+    int rows,
+    int cols,
+    int map_step,
+    int map_offset
+)
+{
+    map_step   /= sizeof(*map);   // bytes -> int elements
+    map_offset /= sizeof(*map);   // bytes -> int elements
+
+    map += map_offset;
+
+    __local int smem[18][18];     // 16x16 work-group area plus one extra element on every side
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    int lidx = get_local_id(0);
+    int lidy = get_local_id(1);
+
+    int grp_idx = get_global_id(0) & 0xFFFFF0;   // low 4 bits cleared: x of the group's first work-item (assuming no global offset)
+    int grp_idy = get_global_id(1) & 0xFFFFF0;   // same for y
+    // The 256 work-items fill the 18x18 tile by flat id: ids 0..251 load rows 0..13, ids 0..71 load rows 14..17.
+    int tid = lidx + lidy * 16;
+    int lx = tid % 18;
+    int ly = tid / 18;
+    if(ly < 14)
+    {
+        smem[ly][lx] =
+            map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step];
+    }
+    if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols)
+    {
+        smem[ly + 14][lx] =
+            map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(gidy < rows && gidx < cols)
+    {
+        int n;   // number of matching neighbours in the 3x3 area
+
+        #pragma unroll
+        for (int k = 0; k < 16; ++k)
+        {
+            n = 0;
+            // NOTE(review): no barrier inside this loop, so neighbours may be read while other work-items update them.
+            if (smem[lidy + 1][lidx + 1] == 1)
+            {
+                n += smem[lidy    ][lidx    ] == 2;
+                n += smem[lidy    ][lidx + 1] == 2;
+                n += smem[lidy    ][lidx + 2] == 2;
+
+                n += smem[lidy + 1][lidx    ] == 2;
+                n += smem[lidy + 1][lidx + 2] == 2;
+
+                n += smem[lidy + 2][lidx    ] == 2;
+                n += smem[lidy + 2][lidx + 1] == 2;
+                n += smem[lidy + 2][lidx + 2] == 2;
+            }
+
+            if (n > 0)
+                smem[lidy + 1][lidx + 1] = 2;   // promote the candidate to an edge pixel
+        }
+
+        const int e = smem[lidy + 1][lidx + 1];
+        map[gidx + 1 + (gidy + 1) * map_step] = e;   // write the (possibly promoted) type back, +1 offset in x and y
+
+        n = 0;
+        if(e == 2)   // edge pixel: count the candidates that are still around it
+        {
+            n += smem[lidy    ][lidx    ] == 1;
+            n += smem[lidy    ][lidx + 1] == 1;
+            n += smem[lidy    ][lidx + 2] == 1;
+
+            n += smem[lidy + 1][lidx    ] == 1;
+            n += smem[lidy + 1][lidx + 2] == 1;
+
+            n += smem[lidy + 2][lidx    ] == 1;
+            n += smem[lidy + 2][lidx + 1] == 1;
+            n += smem[lidy + 2][lidx + 2] == 1;
+        }
+
+        if(n > 0)
+        {
+            unsigned int ind = atomic_inc(counter);   // reserve the next free slot of st
+            st[ind] = (ushort2)(gidx + 1, gidy + 1);
+        }
+    }
+}
+
+__constant int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};   // x offsets of the 8 neighbours
+__constant int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};   // y offsets of the 8 neighbours
+
+// Global hysteresis pass. Each work-group takes one position from st1, promotes the type-1
+// neighbours of that position to type 2 and keeps following them through a local stack.
+// Positions still on the stack when the loop ends are appended to st2, and *counter is
+// advanced by their number.
+#define stack_size 512
+__kernel
+void
+__attribute__((reqd_work_group_size(128,1,1)))
+edgesHysteresisGlobal
+(
+    __global int * map,         // edge types (0/1/2), updated in place
+    __global ushort2 * st1,     // input positions, `count` entries
+    __global ushort2 * st2,     // output positions
+    __global int * counter,     // advanced by the number of entries written to st2
+    int rows,
+    int cols,
+    int count,
+    int map_step,               // in bytes, converted to elements below
+    int map_offset              // in bytes, converted to elements below
+)
+{
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    map += map_offset;
+
+    int lidx = get_local_id(0);
+    int grp_idx = get_group_id(0);
+    int grp_idy = get_group_id(1);
+
+    __local unsigned int s_counter;   // number of positions currently on the local stack
+    __local unsigned int s_ind;       // first st2 slot reserved for this work-group
+
+    __local ushort2 s_st[stack_size];
+
+    if(lidx == 0)
+    {
+        s_counter = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Linear work-group index over the 2D grid of groups: the row stride must be the number of
+    // groups along x, otherwise different (grp_idx, grp_idy) pairs map to the same st1 entry.
+    int ind = mad24(grp_idy, (int)get_num_groups(0), grp_idx);
+
+    if(ind < count)
+    {
+        ushort2 pos = st1[ind];
+        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+        {
+            if (lidx < 8)   // the first 8 work-items each test one neighbour of the start position
+            {
+                pos.x += c_dx[lidx];
+                pos.y += c_dy[lidx];
+
+                if (map[pos.x + pos.y * map_step] == 1)
+                {
+                    map[pos.x + pos.y * map_step] = 2;
+
+                    ind = atomic_inc(&s_counter);
+
+                    s_st[ind] = pos;
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            // Stop when the stack is empty or a further round could overflow it.
+            while (s_counter > 0 && s_counter <= stack_size - get_local_size(0))
+            {
+                const int subTaskIdx = lidx >> 3;   // 8 consecutive work-items share one stack entry
+                const int portion = min(s_counter, (uint)(get_local_size(0)>> 3));   // entries popped this round
+
+                pos.x = pos.y = 0;
+
+                if (subTaskIdx < portion)
+                    pos = s_st[s_counter - 1 - subTaskIdx];
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (lidx == 0)
+                    s_counter -= portion;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
+                {
+                    pos.x += c_dx[lidx & 7];
+                    pos.y += c_dy[lidx & 7];
+
+                    if (map[pos.x + pos.y * map_step] == 1)
+                    {
+                        map[pos.x + pos.y * map_step] = 2;
+
+                        ind = atomic_inc(&s_counter);
+
+                        s_st[ind] = pos;
+                    }
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+            }
+
+            if (s_counter > 0)
+            {
+                if (lidx == 0)
+                {
+                    // atomic_add returns the value *before* the addition, which is exactly
+                    // the first st2 slot reserved for this work-group.
+                    s_ind = atomic_add(counter, s_counter);
+                }
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                ind = s_ind;
+
+                for (int i = lidx; i < s_counter; i += get_local_size(0))
+                {
+                    st2[ind + i] = s_st[i];
+                }
+            }
+        }
+    }
+}
+#undef stack_size
+
+// Get the edge result: pixels whose edge type is 2 are marked as edge points (255), all others become 0.
+// map		edge type mappings, addressed with a +1 offset in x and y; map_step/map_offset are given in bytes
+// dst		edge output; dst_step/dst_offset are applied as uchar element counts
+__kernel
+void getEdges
+(
+    __global const int * map,
+    __global uchar * dst,
+    int rows,
+    int cols,
+    int map_step,
+    int map_offset,
+    int dst_step,
+    int dst_offset
+)
+{
+    map_step   /= sizeof(*map);
+    map_offset /= sizeof(*map);
+
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+    if(gidy < rows && gidx < cols)
+    {
+        const int type = map[gidx + 1 + (gidy + 1) * map_step + map_offset];
+        dst[gidx + gidy * dst_step + dst_offset] = (uchar)(-(type >> 1));   // 2 -> 255, 0/1 -> 0; dst_offset honoured
+    }
+}
--- a/modules/imgproc/src/opencl/clahe.cl
+++ b/modules/imgproc/src/opencl/clahe.cl
@ -0,0 +1,255 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1   // default when not supplied: every WAVE_SIZE-guarded barrier in reduce() is compiled in
+#endif
+
+// Inclusive prefix sum over smem[0..255]: stores val at slot tid and returns the sum of slots 0..tid.
+int calc_lut(__local int* smem, int val, int tid)
+{
+    smem[tid] = val;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid == 0)   // a single work-item performs the serial scan
+        for (int bin = 0; bin < 255; ++bin)
+            smem[bin + 1] += smem[bin];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return smem[tid];
+}
+
+#ifdef CPU   // CPU variant: a barrier follows every step and the total is written to smem[256]
+void reduce(volatile __local int* smem, int val, int tid)
+{
+    smem[tid] = val;   // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)     // fold the upper half onto the lower half; the active range halves each step
+        smem[tid] = val += smem[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+        smem[tid] = val += smem[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+        smem[tid] += smem[tid + 32];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+        smem[tid] += smem[tid + 16];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+        smem[tid] += smem[tid + 8];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 4)
+        smem[tid] += smem[tid + 4];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 2)
+        smem[tid] += smem[tid + 2];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 1)
+        smem[256] = smem[tid] + smem[tid + 1];   // the final sum goes to slot 256, not slot 0
+    barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+#else
+// Non-CPU variant: same tree reduction, but the total is left in smem[0] and no barrier follows the last step.
+void reduce(__local volatile int* smem, int val, int tid)
+{
+    smem[tid] = val;   // publish this work-item's value
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 128)
+        smem[tid] = val += smem[tid + 128];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 64)
+        smem[tid] = val += smem[tid + 64];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32   // keep a barrier (and a narrower branch) between the 32 and 16 steps
+    } barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 16)
+    {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16   // keep a barrier (and a narrower branch) between the 16 and 8 steps
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (tid < 8)
+    {
+#endif
+        smem[tid] += smem[tid + 8];
+        smem[tid] += smem[tid + 4];   // NOTE(review): steps 8..1 never have barriers between them; presumably relies on lockstep execution - confirm
+        smem[tid] += smem[tid + 2];
+        smem[tid] += smem[tid + 1];
+    }
+}
+#endif
+
+// Builds the lookup table of one tile: histogram of the tile, optional clipping at clipLimit with even
+// redistribution of the clipped samples, then a cumulative sum scaled by lutScale and saturated to uchar.
+// The work-group (tx, ty) selects the tile; tid doubles as histogram bin (assumes 256 work-items per group - confirm).
+__kernel void calcLut(__global __const uchar * src, __global uchar * lut,
+                      const int srcStep, const int dstStep,
+                      const int2 tileSize, const int tilesX,
+                      const int clipLimit, const float lutScale,
+                      const int src_offset, const int dst_offset)
+{
+    __local int smem[512];
+    __local int totalClipped;   // declared here because __local variables must live at kernel function scope
+
+    const int tx = get_group_id(0);
+    const int ty = get_group_id(1);
+    const unsigned int tid = get_local_id(1) * get_local_size(0)
+                             + get_local_id(0);
+
+    smem[tid] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
+    {
+        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
+        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
+        {
+            const int data = srcPtr[j];
+            atomic_inc(&smem[data]);
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int tHistVal = smem[tid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (clipLimit > 0)
+    {
+        // clip histogram bar
+        int clipped = 0;
+        if (tHistVal > clipLimit)
+        {
+            clipped = tHistVal - clipLimit;
+            tHistVal = clipLimit;
+        }
+
+        // find number of overall clipped samples
+        reduce(smem, clipped, tid);
+        barrier(CLK_LOCAL_MEM_FENCE);
+#ifdef CPU
+        clipped = smem[256];
+#else
+        clipped = smem[0];
+#endif
+
+        // broadcast evaluated value
+        if (tid == 0)
+            totalClipped = clipped;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // redistribute clipped samples evenly
+        int redistBatch = totalClipped / 256;
+        tHistVal += redistBatch;
+
+        int residual = totalClipped - redistBatch * 256;
+        if (tid < residual)
+            ++tHistVal;
+    }
+
+    const int lutVal = calc_lut(smem, tHistVal, tid);
+    uint ires = (uint)convert_int_rte(lutScale * lutVal);
+    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
+        convert_uchar(clamp(ires, (uint)0, (uint)255));
+}
+
+// Applies the per-tile lookup tables to the image: every pixel is looked up in the tables of the
+// four surrounding tiles (indices clamped to the tile grid) and the results are blended bilinearly.
+__kernel void transform(__global __const uchar * src,
+                        __global uchar * dst,
+                        __global uchar * lut,
+                        const int srcStep, const int dstStep, const int lutStep,
+                        const int cols, const int rows,
+                        const int2 tileSize,
+                        const int tilesX, const int tilesY,
+                        const int src_offset, const int dst_offset, int lut_offset)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    if (x < cols && y < rows)
+    {
+        // Pixel position in tile units, shifted by half a tile.
+        const float tileYf = (convert_float(y) / tileSize.y) - 0.5f;
+        const float tileXf = (convert_float(x) / tileSize.x) - 0.5f;
+        const int rowFloor = convert_int_rtn(tileYf);
+        const int colFloor = convert_int_rtn(tileXf);
+        // Blend weights are taken before the tile indices are clamped.
+        const float ya = tileYf - rowFloor;
+        const float xa = tileXf - colFloor;
+        const int ty1 = max(rowFloor, 0);
+        const int ty2 = min(rowFloor + 1, tilesY - 1);
+        const int tx1 = max(colFloor, 0);
+        const int tx2 = min(colFloor + 1, tilesX - 1);
+
+        const int srcVal = src[mad24(y, srcStep, x + src_offset)];
+        const int lutCol = srcVal + lut_offset;
+
+        float res = 0;
+        res += lut[mad24(ty1 * tilesX + tx1, lutStep, lutCol)] * ((1.0f - xa) * (1.0f - ya));
+        res += lut[mad24(ty1 * tilesX + tx2, lutStep, lutCol)] * ((xa) * (1.0f - ya));
+        res += lut[mad24(ty2 * tilesX + tx1, lutStep, lutCol)] * ((1.0f - xa) * (ya));
+        res += lut[mad24(ty2 * tilesX + tx2, lutStep, lutCol)] * ((xa) * (ya));
+
+        const uint ires = (uint)convert_int_rte(res);
+        dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
+    }
+}
--- a/modules/imgproc/src/opencl/convolve.cl
+++ b/modules/imgproc/src/opencl/convolve.cl
@ -0,0 +1,109 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jiang Liyuan, jlyuan001.good@163.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (__ATI__)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (__NVIDIA__)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+/************************************** convolve **************************************/
+
+// 2D float convolution of src with the kWidth x kHeight coefficients in temp1. Each work-item first
+// loads four elements of a 32x32 local tile (its own position shifted by -8/+8 in x and y, clamped
+// to the image) and then accumulates its output pixel from that tile.
+// NOTE(review): the +16 tile offsets assume a 16x16 work-group - confirm against the host code.
+__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst,
+                          int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight,
+                          int src_offset, int dst_offset, int koffset)
+{
+    __local float smem[16 + 2 * 8][16 + 2 * 8];
+
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+    const int gx = get_global_id(0);
+    const int gy = get_global_id(1);
+
+    // Clamped source coordinates of the four tile elements this work-item loads.
+    const int rowNear = min(max(gy - 8, 0), rows - 1);
+    const int rowFar  = min(gy + 8, rows - 1);
+    const int colNear = min(max(gx - 8, 0), cols - 1);
+    const int colFar  = min(gx + 8, cols - 1);
+
+    smem[ly][lx]           = src[rowNear * src_step + colNear + src_offset];   // upper-left quadrant
+    smem[ly][lx + 16]      = src[rowNear * src_step + colFar + src_offset];    // upper-right quadrant
+    smem[ly + 16][lx]      = src[rowFar * src_step + colNear + src_offset];    // lower-left quadrant
+    smem[ly + 16][lx + 16] = src[rowFar * src_step + colFar + src_offset];     // lower-right quadrant
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (gx >= cols || gy >= rows)
+        return;
+
+    // Tile element under the first coefficient of the window centred on this pixel.
+    const int rowBase = ly + 8 - kHeight / 2;
+    const int colBase = lx + 8 - kWidth / 2;
+
+    float res = 0;
+    for (int i = 0; i < kHeight; ++i)
+    {
+        for (int j = 0; j < kWidth; ++j)
+            res += smem[rowBase + i][colBase + j] * temp1[i * k_step + j + koffset];
+    }
+
+    dst[gy * dst_step + gx + dst_offset] = res;
+}
--- a/modules/imgproc/src/opencl/copymakeborder.cl
+++ b/modules/imgproc/src/opencl/copymakeborder.cl
@ -0,0 +1,134 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Zero Lin zero.lin@amd.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_amd_fp64
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#elif defined (cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+#endif
+
+#ifdef BORDER_CONSTANT   // EXTRAPOLATE(x, y, v): store in v the value for source coordinates (x, y); x and y may be rewritten
+#define EXTRAPOLATE(x, y, v) v = scalar;   // constant border: v takes the `scalar` in scope at the expansion site
+#elif defined BORDER_REPLICATE   // clamp the coordinates to the nearest valid pixel
+#define EXTRAPOLATE(x, y, v) \
+    { \
+        x = max(min(x, src_cols - 1), 0); \
+        y = max(min(y, src_rows - 1), 0); \
+        v = src[mad24(y, src_step, x + src_offset)]; \
+    }
+#elif defined BORDER_WRAP   // wrap the coordinates around periodically
+#define EXTRAPOLATE(x, y, v) \
+    { \
+        if (x < 0) \
+            x -= ((x - src_cols + 1) / src_cols) * src_cols; \
+        if (x >= src_cols) \
+            x %= src_cols; \
+        \
+        if (y < 0) \
+            y -= ((y - src_rows + 1) / src_rows) * src_rows; \
+        if( y >= src_rows ) \
+            y %= src_rows; \
+        v = src[mad24(y, src_step, x + src_offset)]; \
+    }
+#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)   // mirror the coordinates at the image edges
+#ifdef BORDER_REFLECT   // delta = 0: the edge pixel itself is repeated in the mirror image (x = -1 maps to 0)
+#define DELTA int delta = 0
+#else   // BORDER_REFLECT_101, delta = 1: the edge pixel is not repeated (x = -1 maps to 1)
+#define DELTA int delta = 1
+#endif
+#define EXTRAPOLATE(x, y, v) \
+    { \
+        DELTA; \
+        if (src_cols == 1) \
+            x = 0; \
+        else \
+            do \
+            { \
+                if( x < 0 ) \
+                    x = -x - 1 + delta; \
+                else \
+                    x = src_cols - 1 - (x - src_cols) - delta; \
+            } \
+            while (x >= src_cols || x < 0); \
+        \
+        if (src_rows == 1) \
+            y = 0; \
+        else \
+            do \
+            { \
+                if( y < 0 ) \
+                    y = -y - 1 + delta; \
+                else \
+                    y = src_rows - 1 - (y - src_rows) - delta; \
+            } \
+            while (y >= src_rows || y < 0); \
+        v = src[mad24(y, src_step, x + src_offset)]; \
+    }
+#else
+#error No extrapolation method
+#endif
+// True when (gx, gy) lies outside the source image; uses the src_cols/src_rows in scope at the expansion site.
+#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)
+
+// Copies src into dst shifted by (left, top). Destination pixels whose source position falls
+// outside the source image are produced by the EXTRAPOLATE rule selected at compile time.
+__kernel void copymakeborder
+                        (__global const GENTYPE *src,
+                         __global GENTYPE *dst,
+                         int dst_cols, int dst_rows,
+                         int src_cols, int src_rows,
+                         int src_step, int src_offset,
+                         int dst_step, int dst_offset,
+                         int top, int left, GENTYPE scalar)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    if (x >= dst_cols || y >= dst_rows)
+        return;
+
+    int src_x = x - left;   // not const: EXTRAPOLATE may rewrite the coordinates
+    int src_y = y - top;
+    const int dst_index = mad24(y, dst_step, x + dst_offset);
+
+    if (!NEED_EXTRAPOLATION(src_x, src_y))
+    {
+        dst[dst_index] = src[mad24(src_y, src_step, src_x + src_offset)];
+        return;
+    }
+    EXTRAPOLATE(src_x, src_y, dst[dst_index])
+}
--- a/modules/imgproc/src/opencl/cvtcolor.cl
+++ b/modules/imgproc/src/opencl/cvtcolor.cl
@ -0,0 +1,306 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/**************************************PUBLICFUNC*************************************/
+
+#if defined (DOUBLE_SUPPORT)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+// depth, scn, dcn and bidx are not defined in this file; presumably supplied as compile-time definitions by the host - confirm.
+#if depth == 0   // CV_8U
+    #define DATA_TYPE uchar
+    #define MAX_NUM  255
+    #define HALF_MAX 128
+    #define SAT_CAST(num) convert_uchar_sat(num)
+    #define DEPTH_0
+#elif depth == 2   // CV_16U
+    #define DATA_TYPE ushort
+    #define MAX_NUM  65535
+    #define HALF_MAX 32768
+    #define SAT_CAST(num) convert_ushort_sat(num)
+    #define DEPTH_2
+#elif depth == 5   // CV_32F: no saturation, values are passed through
+    #define DATA_TYPE float
+    #define MAX_NUM  1.0f
+    #define HALF_MAX 0.5f
+    #define SAT_CAST(num) (num)
+    #define DEPTH_5
+#else
+    #error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)"
+#endif
+
+#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))   // x / 2^n with half of the divisor added before the shift
+
+enum
+{
+    yuv_shift  = 14,     // fraction bits of the fixed-point gray/YUV coefficients
+    xyz_shift  = 12,     // not used in the part of the file shown here
+    R2Y        = 4899,   // ~0.299 * 2^14
+    G2Y        = 9617,   // ~0.587 * 2^14
+    B2Y        = 1868,   // ~0.114 * 2^14
+    BLOCK_SIZE = 256     // not used in the part of the file shown here
+};
+
+#define scnbytes ((int)sizeof(DATA_TYPE)*scn)   // size in bytes of one source pixel (scn channels)
+#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn)   // size in bytes of one destination pixel (dcn channels)
+
+///////////////////////////////////// RGB <-> GRAY //////////////////////////////////////
+// Colour -> gray: 0.114 * src[bidx] + 0.587 * src[1] + 0.299 * src[bidx^2] (fixed point, yuv_shift bits, for integer depths).
+__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset,
+                       __global uchar* dstptr, int dststep, int dstoffset,
+                       int rows, int cols)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    if (y < rows && x < cols)
+    {
+        // The typed views keep the __global qualifier: OpenCL C does not allow a cast to change the address space.
+        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+#if defined (DEPTH_5)
+        dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f;
+#else
+        dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift);
+#endif
+    }
+}
+
+// Gray -> colour: copies the gray value to channels 0..2 (channel 3 = MAX_NUM when dcn == 4); typed pointers stay __global.
+__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
+                       __global uchar* dstptr, int dststep, int dstoffset,
+                       int rows, int cols)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    if (y < rows && x < cols)
+    {
+        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+        DATA_TYPE val = src[0];
+        dst[0] = dst[1] = dst[2] = val;
+#if dcn == 4
+        dst[3] = MAX_NUM;
+#endif
+    }
+}
+
+///////////////////////////////////// RGB <-> YUV //////////////////////////////////////
+// Coefficients: Y weights for B, G, R, then the U and V scale factors; the integer table is scaled by 2^yuv_shift.
+__constant float c_RGB2YUVCoeffs_f[5]  = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
+__constant int   c_RGB2YUVCoeffs_i[5]  = { B2Y, G2Y, R2Y, 8061, 14369 };
+// RGB -> YUV: Y = 0.114*B + 0.587*G + 0.299*R, U = (B - Y)*0.492 + HALF_MAX, V = (R - Y)*0.877 + HALF_MAX.
+__kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset,
+                      __global uchar* dstptr, int dststep, int dstoffset,
+                      int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        // The typed views keep the __global qualifier: OpenCL C does not allow a cast to change the address space.
+        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+        DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2];
+#if defined (DEPTH_5)
+        __constant float * coeffs = c_RGB2YUVCoeffs_f;
+        const DATA_TYPE Y  = b * coeffs[0] + g * coeffs[1] + r * coeffs[2];
+        const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX;
+        const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX;
+#else
+        __constant int * coeffs = c_RGB2YUVCoeffs_i;
+        const int delta = HALF_MAX * (1 << yuv_shift);   // HALF_MAX in the same fixed-point scale
+        const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift);
+        const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift);
+        const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift);
+#endif
+
+        dst[0] = SAT_CAST( Y );
+        dst[1] = SAT_CAST( U );
+        dst[2] = SAT_CAST( V );
+    }
+}
+
+// Coefficients for YUV -> RGB. Entry 0 scales U into blue, entries 1 and 2
+// scale U and V into green, entry 3 scales V into red. The arrays are declared
+// with five slots but only the first four are initialised and read.
+__constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f };
+__constant int   c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 };
+
+// Converts one pixel per work-item from YUV back to RGB/BGR.
+//   srcptr, srcstep, srcoffset : source bytes, row pitch in bytes and byte
+//                                offset of the first pixel
+//   dstptr, dststep, dstoffset : the same triple for the destination
+//   rows, cols                 : work-items at or beyond these bounds do nothing
+// bidx selects the destination channel receiving blue; red goes to bidx^2.
+// When dcn == 4 the fourth channel is filled with MAX_NUM.
+__kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset,
+                      __global uchar* dstptr, int dststep, int dstoffset,
+                      int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        // srcptr/dstptr point into __global memory, so the typed views must
+        // carry the same address-space qualifier; without it they would be
+        // __private pointers, which may not alias __global storage.
+        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+        DATA_TYPE Y = src[0], U = src[1], V = src[2];
+
+#if defined (DEPTH_5)
+        __constant float * coeffs = c_YUV2RGBCoeffs_f;
+        const float r = Y + (V - HALF_MAX) * coeffs[3];
+        const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1];
+        const float b = Y + (U - HALF_MAX) * coeffs[0];
+#else
+        __constant int * coeffs = c_YUV2RGBCoeffs_i;
+        const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift);
+        const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift);
+        const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift);
+#endif
+
+        dst[bidx] = SAT_CAST( b );
+        dst[1] = SAT_CAST( g );
+        dst[bidx^2] = SAT_CAST( r );
+#if dcn == 4
+        dst[3] = MAX_NUM;
+#endif
+    }
+}
+
+// Integer conversion factors for the NV12 kernel below; every sum built from
+// them is shifted right by ITUR_BT_601_SHIFT before it is stored.
+__constant int ITUR_BT_601_CY = 1220542;
+__constant int ITUR_BT_601_CUB = 2116026;
+__constant int ITUR_BT_601_CUG = 409993;
+__constant int ITUR_BT_601_CVG = 852492;
+__constant int ITUR_BT_601_CVR = 1673527;
+__constant int ITUR_BT_601_SHIFT = 20;
+
+// Stores one output pixel at pix: luma is the raw Y sample, ruv/guv/buv are the
+// chroma contributions (rounding term included) shared by a 2x2 block.
+// Red goes to channel 2 - bidx, green to 1, blue to bidx; with dcn == 4 the
+// fourth channel is set to 255.
+inline void NV12_storePixel(__global uchar* pix, int luma, int ruv, int guv, int buv)
+{
+    const int yScaled = max(0, luma - 16) * ITUR_BT_601_CY;
+
+    pix[2 - bidx] = convert_uchar_sat((yScaled + ruv) >> ITUR_BT_601_SHIFT);
+    pix[1]        = convert_uchar_sat((yScaled + guv) >> ITUR_BT_601_SHIFT);
+    pix[bidx]     = convert_uchar_sat((yScaled + buv) >> ITUR_BT_601_SHIFT);
+#if dcn == 4
+    pix[3]        = 255;
+#endif
+}
+
+// Converts a 2x2 block of pixels per work-item. Luma is read from rows
+// 2y and 2y+1 of the source; the U/V pair for the block is read from row
+// rows + y, i.e. from the plane stored below the luma rows.
+//   srcptr, srcstep, srcoffset : source bytes, row pitch in bytes, byte offset
+//   dstptr, dststep, dstoffset : destination bytes, row pitch in bytes, byte offset
+//   rows, cols                 : work-items with y >= rows/2 or x >= cols/2 do nothing
+__kernel void YUV2RGBA_NV12(__global const uchar* srcptr, int srcstep, int srcoffset,
+                            __global uchar* dstptr, int dststep, int dstoffset,
+                            int rows, int cols)
+{
+    const int x = get_global_id(0); // block column, valid range [0, cols / 2)
+    const int y = get_global_id(1); // block row,    valid range [0, rows / 2)
+
+    if (y >= rows / 2 || x >= cols / 2)
+        return;
+
+    const int srcCol = (x << 1) + srcoffset;
+    const int dstCol = x*(dcn*2) + dstoffset;
+
+    __global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, srcCol);
+    __global const uchar* usrc = srcptr + mad24(rows + y, srcstep, srcCol);
+    __global uchar*       dst1 = dstptr + mad24(y << 1, dststep, dstCol);
+    __global uchar*       dst2 = dstptr + mad24((y << 1) + 1, dststep, dstCol);
+
+    // Fetch every input sample before the first store.
+    const int Y1 = ysrc[0];
+    const int Y2 = ysrc[1];
+    const int Y3 = ysrc[srcstep];
+    const int Y4 = ysrc[srcstep + 1];
+
+    const int U  = usrc[0] - 128;
+    const int V  = usrc[1] - 128;
+
+    // Half of the fixed-point unit, added so the final shift rounds.
+    const int rounding = 1 << (ITUR_BT_601_SHIFT - 1);
+    const int ruv = rounding + ITUR_BT_601_CVR * V;
+    const int guv = rounding - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U;
+    const int buv = rounding + ITUR_BT_601_CUB * U;
+
+    NV12_storePixel(dst1,       Y1, ruv, guv, buv);
+    NV12_storePixel(dst1 + dcn, Y2, ruv, guv, buv);
+    NV12_storePixel(dst2,       Y3, ruv, guv, buv);
+    NV12_storePixel(dst2 + dcn, Y4, ruv, guv, buv);
+}
+
+///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
+
+// Coefficients for RGB -> YCrCb. Entries 0..2 are the luma weights in
+// R, G, B order; entries 3 and 4 scale (R - Y) and (B - Y) into Cr and Cb.
+// The integer table is the variant consumed through CV_DESCALE(..., yuv_shift).
+__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
+__constant int   c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241};
+
+// Converts one pixel per work-item from RGB/BGR to YCrCb.
+//   srcptr, srcstep, srcoffset : source bytes, row pitch in bytes and byte
+//                                offset of the first pixel
+//   dstptr, dststep, dstoffset : the same triple for the destination
+//   rows, cols                 : work-items at or beyond these bounds do nothing
+// bidx selects the source channel holding blue; red is read from bidx^2.
+__kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset,
+                        __global uchar* dstptr, int dststep, int dstoffset,
+                        int rows, int cols)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (y < rows && x < cols)
+    {
+        // srcptr/dstptr point into __global memory, so the typed views must
+        // carry the same address-space qualifier; without it they would be
+        // __private pointers, which may not alias __global storage.
+        __global const DATA_TYPE* src = (__global const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes));
+        __global DATA_TYPE* dst = (__global DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes));
+        DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2];
+
+        // The tables are laid out R, G, B, so red pairs with coeffs[0] and blue
+        // with coeffs[2]; indexing them in b, g, r order would swap the red and
+        // blue luma weights.
+#if defined (DEPTH_5)
+        __constant float * coeffs = c_RGB2YCrCbCoeffs_f;
+        const DATA_TYPE Y  = b * coeffs[2] + g * coeffs[1] + r * coeffs[0];
+        const DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX;
+        const DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX;
+#else
+        __constant int * coeffs = c_RGB2YCrCbCoeffs_i;
+        // HALF_MAX expressed in the same fixed-point scale as the products below.
+        const int delta = HALF_MAX * (1 << yuv_shift);
+        const int Y =  CV_DESCALE(b * coeffs[2] + g * coeffs[1] + r * coeffs[0], yuv_shift);
+        const int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift);
+        const int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift);
+#endif
+
+        dst[0] = SAT_CAST( Y );
+        dst[1] = SAT_CAST( Cr );
+        dst[2] = SAT_CAST( Cb );
+    }
+}
--- a/modules/imgproc/src/opencl/gftt.cl
+++ b/modules/imgproc/src/opencl/gftt.cl
@ -0,0 +1,275 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// The mask test in findCorners is compiled in only when WITH_MASK is non-zero;
+// fall back to "no mask" when the build options did not define it.
+#if !defined(WITH_MASK)
+#define WITH_MASK 0
+#endif
+
+// Unnormalized coordinates, clamp-to-edge addressing, nearest-texel filtering.
+__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+// Returns the first channel of the texel of _eig at integer position (_x, _y).
+inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
+{
+    const int2 texelPos = (int2)(_x, _y);
+    const float4 texel = read_imagef(_eig, sampler, texelPos);
+    return texel.x;
+}
+
+// Returns the first channel of the texel of _eig addressed by the float pair pt.
+inline float ELEM_FLT2(image2d_t _eig, float2 pt)
+{
+    const float4 texel = read_imagef(_eig, sampler, pt);
+    return texel.x;
+}
+
+// Collects every interior pixel whose eig value exceeds `threshold` and is not
+// smaller than any of its eight neighbours.
+//   eig        : response image, sampled through ELEM_INT2
+//   mask       : consulted only when WITH_MASK is non-zero; zero entries are skipped
+//   corners    : output list, written as (x, y) pairs
+//   mask_strip : row pitch of mask, in pixels
+//   rows, cols : image size; the outermost one-pixel frame is never reported
+//   max_count  : capacity of corners; hits beyond it are counted but not stored
+//   g_counter  : global counter incremented once per accepted pixel
+__kernel
+    void findCorners
+    (
+        image2d_t eig,
+        __global const char * mask,
+        __global float2 * corners,
+        const int mask_strip,// in pixels
+        const float threshold,
+        const int rows,
+        const int cols,
+        const int max_count,
+        __global int * g_counter
+    )
+{
+    const int j = get_global_id(0);
+    const int i = get_global_id(1);
+
+    // Only pixels that have a full 3x3 neighbourhood are candidates.
+    if (i <= 0 || i >= rows - 1 || j <= 0 || j >= cols - 1)
+        return;
+
+#if WITH_MASK
+    if (mask[i * mask_strip + j] == 0)
+        return;
+#endif
+
+    const float val = ELEM_INT2(eig, j, i);
+    if (!(val > threshold))
+        return;
+
+    // Running maximum over the eight neighbours, visited row by row.
+    float maxVal = val;
+    for (int dy = -1; dy <= 1; ++dy)
+    {
+        for (int dx = -1; dx <= 1; ++dx)
+        {
+            if (dx == 0 && dy == 0)
+                continue;
+            maxVal = fmax(ELEM_INT2(eig, j + dx, i + dy), maxVal);
+        }
+    }
+
+    if (val != maxVal)
+        return;
+
+    // Reserve a slot; the counter keeps growing even once the list is full.
+    const int ind = atomic_inc(g_counter);
+    if (ind < max_count)
+        corners[ind] = (float2)(j, i);
+}
+
+// Bitonic sort: one compare-exchange pass over `corners`, ordering the points
+// by the eig value sampled at each point.
+//   eig         : response image the sort key is read from
+//   corners     : points to sort, `count` entries, modified in place
+//   stage       : selects the direction per block of (1 << stage) work-items
+//   passOfStage : together with stage gives the pair distance
+//                 1 << (stage - passOfStage)
+// Work-items with id >= count / 2 do nothing.
+// NOTE(review): the host presumably launches this once per (stage, passOfStage)
+// pair and with `count` a power of two - confirm against the caller.
+__kernel
+    void sortCorners_bitonicSort
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        const int stage,
+        const int passOfStage
+    )
+{
+    const int threadId = get_global_id(0);
+    if(threadId >= count / 2)
+    {
+        return;
+    }
+
+    const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 is descent
+
+    const int pairDistance = 1 << (stage - passOfStage);
+    const int blockWidth   = 2 * pairDistance;
+
+    // Valid indices are 0 .. count - 1, so that is what the clamp has to use;
+    // clamping to `count` itself would still touch one element past the end.
+    const int lastId = count - 1;
+
+    const int leftId = min( (threadId % pairDistance)
+                   + (threadId / pairDistance) * blockWidth, lastId );
+
+    const int rightId = min( leftId + pairDistance, lastId );
+
+    const float2 leftPt  = corners[leftId];
+    const float2 rightPt = corners[rightId];
+
+    const float leftVal  = ELEM_FLT2(eig, leftPt);
+    const float rightVal = ELEM_FLT2(eig, rightPt);
+
+    const bool compareResult = leftVal > rightVal;
+
+    float2 greater = compareResult ? leftPt:rightPt;
+    float2 lesser  = compareResult ? rightPt:leftPt;
+
+    corners[leftId]  = sortOrder ? lesser : greater;
+    corners[rightId] = sortOrder ? greater : lesser;
+}
+
+//selection sort for gftt
+//kernel is ported from Bolt library:
+//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
+//  Local sort: every work-group orders its own slice of `corners` by
+//  descending eig value. Each work-item loads one point, counts how many
+//  points of the slice have a larger value (its rank) and stores its point
+//  at that rank, i.e. O(n) comparisons per work-item for a slice of n points.
+//    eig     : response image the sort key is read from
+//    corners : points to sort, `count` entries, modified in place
+//    scratch : local buffer with one float2 per work-item of the group
+//  NOTE(review): points whose eig values compare equal all write their own
+//  point into the same run of slots, so only one of them survives - confirm
+//  that this is acceptable for the caller.
+__kernel
+    void sortCorners_selectionSortLocal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count,
+        __local float2 * scratch
+    )
+{
+    const int i           = get_local_id(0);   // index in workgroup
+    const int numOfGroups = get_num_groups(0); // number of workgroups
+    const int groupID     = get_group_id(0);
+    const int wg          = get_local_size(0); // workgroup size = block size
+
+    const int offset = groupID * wg;
+    corners += offset;
+
+    // number of elements to be processed by this work group: the last group
+    // takes whatever is left over, every other group a full block
+    const int n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
+
+    // Surplus work-items of the last group must still reach the barrier, so
+    // they load a dummy element. The last valid index of the slice is n - 1;
+    // clamping to n would read one element past the slice.
+    const float2 pt1 = corners[min(i, max(n - 1, 0))];
+    scratch[i] = pt1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(i >= n)
+    {
+        return;
+    }
+
+    const float val1 = ELEM_FLT2(eig, pt1);
+
+    int pos  = 0; // elements of the slice strictly greater than this one
+    int same = 0; // elements comparing equal to this one (itself included)
+    for (int j=0;j<n;++j)
+    {
+        const float val2 = ELEM_FLT2(eig, scratch[j]);
+        if(val2 > val1)
+            pos++;//calculate the rank of this element in this work group
+        else if(!(val1 > val2))
+            same++;// val1 and val2 are same
+    }
+    for (int j=0; j< same; j++)
+        corners[pos + j] = pt1;
+}
+// Final pass of the selection sort: every work-item computes the rank of its
+// own point among all `count` points and stores the point at that rank.
+//   eig     : response image the sort key is read from
+//   corners : `count` points, read and written in place
+// The inner scans stop at the first element that is smaller than the
+// work-item's own value, which presumes every work-group-sized slice is
+// already in descending order (presumably the result of
+// sortCorners_selectionSortLocal - confirm against the host code).
+// NOTE(review): the kernel reads slices owned by other work-groups while
+// those groups may already be storing results into the same buffer, and there
+// is no synchronisation between groups - confirm this is safe for the caller.
+__kernel
+    void sortCorners_selectionSortFinal
+    (
+        image2d_t eig,
+        __global float2 * corners,
+        const int count
+    )
+{
+    const int          i  = get_local_id(0); // index in workgroup
+    const int numOfGroups = get_num_groups(0); // number of workgroups
+    const int groupID     = get_group_id(0);
+    const int         wg  = get_local_size(0); // workgroup size = block size
+    int pos = 0, same = 0;
+    const int offset = get_group_id(0) * wg;
+    // number of elements in the last (possibly partial) slice
+    const int remainder = count - wg*(numOfGroups-1);
+
+    // work-items past the end of the data have nothing to place
+    if((offset + i ) >= count)
+        return;
+    float2 pt1, pt2;
+    pt1 = corners[groupID*wg + i];
+
+    float val1 = ELEM_FLT2(eig, pt1);
+    float val2;
+
+    // scan every full slice
+    for(int j=0; j<numOfGroups-1; j++ )
+    {
+        for(int k=0; k<wg; k++)
+        {
+            pt2  = corners[j*wg + k];
+            val2 = ELEM_FLT2(eig, pt2);
+            // leave this slice at the first element smaller than our own
+            if(val1 > val2)
+                break;
+            else
+            {
+                //Increment only if the value is not the same.
+                if( val2 > val1 )
+                    pos++;
+                else
+                    same++;
+            }
+        }
+    }
+
+    // scan the last slice, which holds only `remainder` elements
+    for(int k=0; k<remainder; k++)
+    {
+        pt2  = corners[(numOfGroups-1)*wg + k];
+        val2 = ELEM_FLT2(eig, pt2);
+        if(val1 > val2)
+            break;
+        else
+        {
+            //Don't increment if the value is the same.
+            //Two elements are same if (*userComp)(jData, iData)  and (*userComp)(iData, jData) are both false
+            if(val2 > val1)
+                pos++;
+            else
+                same++;
+        }
+    }
+    // pos = number of strictly greater elements seen, same = number of equal
+    // ones; the point is stored into every slot of that run
+    for (int j=0; j< same; j++)
+        corners[pos + j] = pt1;
+}
--- a/modules/imgproc/src/opencl/harris.cl
+++ b/modules/imgproc/src/opencl/harris.cl
@ -0,0 +1,202 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Exactly one BORDER_* symbol is expected from the build options. Each set
+// maps an index i onto an index inside [l_edge, r_edge) / [t_edge, b_edge):
+// ADDR_L / ADDR_H handle the low side, ADDR_R / ADDR_B the high side, where
+// `addr` is the value to keep when i is not past the high edge.
+// BORDER_CONSTANT defines no ADDR_* macros.
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT101
+//BORDER_REFLECT101:   gfedcb|abcdefgh|gfedcba
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_WRAP
+//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#endif
+
+// Width of the __local staging rows used by calcHarris.
+#define THREADS 256
+// Selects elem1 when i lies in [l_edge, r_edge), elem2 otherwise. The whole
+// expansion is parenthesised so it stays one operand inside larger expressions.
+#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////calcHarris////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Harris response from the derivative images Dx and Dy.
+//   Dx, Dy, dst       : float images; the *_step arguments are row pitches in
+//                       bytes (they are shifted right by 2 to index floats)
+//   *_offset          : byte offset of the region inside its image, split
+//                       below into a column and a row offset
+//   *_whole_rows/cols : size of the whole image, used for border handling
+//   dst_rows/dst_cols : size of the output region
+//   k                 : weight of the squared-trace term
+// ksX, ksY, anX and anY are expected as preprocessor definitions from the build
+// options, together with exactly one BORDER_* symbol.
+// Each work-item sums dx*dx, dx*dy and dy*dy over a column of ksY+1 rows and
+// publishes two partial sums per product (rows 0..ksY-1 and rows 1..ksY) in
+// local memory; after the barrier the columns are summed horizontally and two
+// output rows are written per work-group.
+// NOTE(review): local memory is indexed by get_local_id(0), so the work-group
+// width presumably equals THREADS - confirm against the host code.
+__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst,
+                              int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
+                              int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
+                              float k)
+{
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+    const int gly = get_global_id(1);
+
+    int dx_x_off = (dx_offset % dx_step) >> 2;
+    int dx_y_off = dx_offset / dx_step;
+    int dy_x_off = (dy_offset % dy_step) >> 2;
+    int dy_y_off = dy_offset / dy_step;
+    int dst_x_off = (dst_offset % dst_step) >> 2;
+    int dst_y_off = dst_offset / dst_step;
+
+    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
+    int dx_startY = (gY << 1) - anY + dx_y_off;
+    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
+    int dy_startY = (gY << 1) - anY + dy_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+
+    float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
+    __local float temp[6][THREADS];
+#ifdef BORDER_CONSTANT
+    for(int i=0; i < ksY+1; i++)
+    {
+        // Dereference Dx/Dy only when the tap lies inside the whole image;
+        // taps outside contribute 0 and must not touch memory at all.
+        const bool dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
+        dx_data[i] = dx_con ? Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)] : 0.0f;
+        const bool dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
+        dy_data[i] = dy_con ? Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)] : 0.0f;
+        data[0][i] = dx_data[i] * dx_data[i];
+        data[1][i] = dx_data[i] * dy_data[i];
+        data[2][i] = dy_data[i] * dy_data[i];
+    }
+#else
+    int clamped_col = min(dst_cols, col);
+    for(int i=0; i < ksY+1; i++)
+    {
+        // Fold the row/column back into the image with the ADDR_* macros of
+        // the selected border mode: low edge first, then high edge.
+        int dx_selected_row;
+        int dx_selected_col;
+        dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
+        dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
+        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
+        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
+        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
+
+        int dy_selected_row;
+        int dy_selected_col;
+        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
+        dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
+        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
+        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
+        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
+
+        data[0][i] = dx_data[i] * dx_data[i];
+        data[1][i] = dx_data[i] * dy_data[i];
+        data[2][i] = dy_data[i] * dy_data[i];
+    }
+#endif
+    // Rows 1..ksY-1 are shared by both output rows; row 0 belongs only to the
+    // first one and row ksY only to the second.
+    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f;
+    for(int i=1; i < ksY; i++)
+    {
+        sum0 += (data[0][i]);
+        sum1 += (data[1][i]);
+        sum2 += (data[2][i]);
+    }
+    float sum01,sum02,sum11,sum12,sum21,sum22;
+    sum01 = sum0 + (data[0][0]);
+    sum02 = sum0 + (data[0][ksY]);
+    temp[0][col] = sum01;
+    temp[1][col] = sum02;
+    sum11 = sum1 + (data[1][0]);
+    sum12 = sum1 + (data[1][ksY]);
+    temp[2][col] = sum11;
+    temp[3][col] = sum12;
+    sum21 = sum2 + (data[2][0]);
+    sum22 = sum2 + (data[2][ksY]);
+    temp[4][col] = sum21;
+    temp[5][col] = sum22;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Only the first THREADS-(ksX-1) work-items own an output column; the
+    // remaining ones only supplied columns for the horizontal window.
+    if(col < (THREADS-(ksX-1)))
+    {
+        col += anX;
+        int posX = dst_startX - dst_x_off + col - anX;
+        int posY = (gly << 1);
+        int till = (ksX + 1)%2;
+        float tmp_sum[6]={ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+        // c walks the six local rows; it is deliberately not called k so the
+        // kernel argument k stays visible and unambiguous.
+        for(int c=0; c<6; c++)
+            for(int i=-anX; i<=anX - till; i++)
+            {
+                tmp_sum[c] += temp[c][col+i];
+            }
+
+        // response = (sum dx*dx)*(sum dy*dy) - (sum dx*dy)^2
+        //            - k * (sum dx*dx + sum dy*dy)^2
+        if(posX < dst_cols && (posY) < dst_rows)
+        {
+            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
+                    tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
+        }
+        if(posX < dst_cols && (posY + 1) < dst_rows)
+        {
+            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
+                    tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
+        }
+    }
+}
--- a/modules/imgproc/src/opencl/histogram.cl
+++ b/modules/imgproc/src/opencl/histogram.cl
@ -0,0 +1,279 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Xu Pang, pangxu010@163.com
+//    Wenju He, wenju@multicorewareinc.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+#define PARTIAL_HISTOGRAM256_COUNT     (256)
+#define HISTOGRAM256_BIN_COUNT         (256)
+
+#define HISTOGRAM256_WORK_GROUP_SIZE     (256)
+#define HISTOGRAM256_LOCAL_MEM_SIZE      (HISTOGRAM256_BIN_COUNT)
+
+#define NBANKS (16)
+#define NBANKS_BIT (4)
+
+
+// Builds one 256-bin partial histogram per work-group from 8-bit data that is
+// read sixteen bytes at a time as uint4.
+//   src, src_step, src_offset : input; step and offset are applied to the
+//                               uint4 pointer, i.e. counted in uint4 elements
+//   globalHist, hist_step     : output, row gx (the group id) receives this
+//                               group's 256 bins
+//   dataCount                 : number of uint4 elements to process in total
+//   cols                      : row length used to turn a linear id into (x, y)
+//   inc_x, inc_y              : how far x and y advance between two elements
+//                               handled by the same work-item
+// Counting goes into NBANKS interleaved local copies of the histogram; a
+// work-item always uses the copy selected by the low bits of its local id.
+__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0(
+                                                                      __global const uint4* src,
+                                          int src_step, int src_offset,
+                                                                      __global int* globalHist,
+                                                                      int dataCount,  int cols,
+                                          int inc_x, int inc_y,
+                                          int hist_step)
+{
+        __local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS
+        int gid = get_global_id(0);
+        int lid = get_local_id(0);
+        int gx  = get_group_id(0);
+        int gsize = get_global_size(0);
+        int lsize  = get_local_size(0);
+        const int shift = 8;
+        const int mask = HISTOGRAM256_BIN_COUNT-1;
+        int offset = (lid & (NBANKS-1));// lid % NBANKS
+        uint4 data, temp1, temp2, temp3, temp4;
+        src += src_offset;
+
+        //clear LDS
+        // each work-item zeroes four entries per iteration, lsize apart
+        for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize)
+        {
+            subhist[idx] = 0;
+            subhist[idx+=lsize] = 0;
+            subhist[idx+=lsize] = 0;
+            subhist[idx+=lsize] = 0;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        //read and scatter
+        int y = gid/cols;
+        int x = gid - mul24(y, cols);
+        for(int idx=gid; idx<dataCount; idx+=gsize)
+        {
+              data = src[mad24(y, src_step, x)];
+              // peel the four bytes of every component; a byte value v is
+              // counted at subhist[(v << NBANKS_BIT) + offset]
+              temp1 = ((data & mask) << NBANKS_BIT) + offset;
+              data >>= shift;
+              temp2 = ((data & mask) << NBANKS_BIT) + offset;
+              data >>= shift;
+              temp3 = ((data & mask) << NBANKS_BIT) + offset;
+              data >>= shift;
+              temp4 = ((data & mask) << NBANKS_BIT) + offset;
+
+              atomic_inc(subhist + temp1.x);
+              atomic_inc(subhist + temp1.y);
+              atomic_inc(subhist + temp1.z);
+              atomic_inc(subhist + temp1.w);
+
+              atomic_inc(subhist + temp2.x);
+              atomic_inc(subhist + temp2.y);
+              atomic_inc(subhist + temp2.z);
+              atomic_inc(subhist + temp2.w);
+
+              atomic_inc(subhist + temp3.x);
+              atomic_inc(subhist + temp3.y);
+              atomic_inc(subhist + temp3.z);
+              atomic_inc(subhist + temp3.w);
+
+              atomic_inc(subhist + temp4.x);
+              atomic_inc(subhist + temp4.y);
+              atomic_inc(subhist + temp4.z);
+              atomic_inc(subhist + temp4.w);
+
+              // advance; when x runs past the row, pull it back by cols and
+              // move one extra row down (off is -1 in that case, 0 otherwise)
+              x += inc_x;
+              int off = ((x>=cols) ? -1 : 0);
+              x = mad24(off, cols, x);
+              y += inc_y - off;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        //reduce local banks to single histogram per workgroup
+        // work-item lid owns bin lid and adds up its NBANKS copies
+        int bin1=0, bin2=0, bin3=0, bin4=0;
+        for(int i=0; i<NBANKS; i+=4)
+        {
+             bin1 += subhist[(lid << NBANKS_BIT) + i];
+             bin2 += subhist[(lid << NBANKS_BIT) + i+1];
+             bin3 += subhist[(lid << NBANKS_BIT) + i+2];
+             bin4 += subhist[(lid << NBANKS_BIT) + i+3];
+        }
+
+        globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4;
+}
+
+// Adds the bytes of the columns that calc_sub_hist_D0 does not cover to the
+// partial histograms, one byte per work-item.
+//   src, src_step, src_offset : input bytes, row pitch and start offset in bytes
+//   globalHist, hist_step     : partial histograms; this group accumulates into
+//                               row (group_y * num_groups_x + group_x)
+//   left_col, cols            : work-item columns at or beyond left_col are
+//                               shifted right by cols before reading
+//   rows                      : work-items with a row index >= rows read nothing
+__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1)))
+calc_sub_hist_border_D0(__global const uchar* src, int src_step, int src_offset,
+                        __global int* globalHist, int left_col, int cols,
+                        int rows, int hist_step)
+{
+    __local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE];
+
+    const int row  = get_global_id(1);
+    const int bin  = get_local_id(1);
+    const int grpX = get_group_id(0);
+    const int grpY = get_group_id(1);
+    const int grpCountX = get_num_groups(0);
+    const int histRow = mad24(grpY, grpCountX, grpX);
+
+    // every work-item clears the bin it owns
+    subhist[bin] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int column = get_global_id(0);
+    if (column >= left_col)
+        column += cols;
+
+    if (row < rows)
+    {
+        const int value = (int)src[src_offset + mad24(row, src_step, column)];
+        atomic_inc(&subhist[value]);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // accumulate on top of what is already stored for this row
+    globalHist[mad24(histRow, hist_step, bin)] += subhist[bin];
+}
+
+// Collapses the partial histograms into the final one. Work-group gx handles
+// bin gx: its work-items each fetch buf[i * src_step + gx] for their rows i,
+// then the per-work-item values are summed with a tree reduction in local memory.
+//   buf      : partial histograms, PARTIAL_HISTOGRAM256_COUNT rows of src_step ints
+//   hist     : output, one int per work-group
+//   src_step : row pitch of buf, in ints
+__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf,
+                __global int* hist,
+                int src_step)
+{
+    __local int data[HISTOGRAM256_WORK_GROUP_SIZE];
+
+    const int lane = get_local_id(0);
+    const int bin  = get_group_id(0);
+
+    int partial = 0;
+    for(int row = lane; row < PARTIAL_HISTOGRAM256_COUNT; row += HISTOGRAM256_WORK_GROUP_SIZE)
+        partial += buf[ mad24(row, src_step, bin)];
+    data[lane] = partial;
+
+    // halve the number of active lanes each round; the barrier at the top of
+    // the round makes the previous round's stores visible
+    int stride = HISTOGRAM256_WORK_GROUP_SIZE /2;
+    while(stride > 0)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lane < stride)
+            data[lane] += data[lane + stride];
+        stride >>= 1;
+    }
+
+    if(lane == 0)
+        hist[bin] = data[0];
+}
+
+// Builds the 256-entry lookup table for histogram equalisation.
+//   dst   : output table, one uchar per bin
+//   hist  : 256-bin histogram
+//   total : value compared against the first non-zero bin and used as the
+//           normalisation base (presumably the pixel count - confirm)
+// Work-item 0 turns the histogram into a running sum that starts after the
+// first non-zero bin; afterwards every work-item scales and stores its entry.
+__kernel __attribute__((reqd_work_group_size(256,1,1)))
+void calLUT(__global uchar * dst, __constant int * hist, int total)
+{
+    int lid = get_local_id(0);
+    __local int sumhist[HISTOGRAM256_BIN_COUNT];
+    __local float scale;
+
+    sumhist[lid] = hist[lid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (lid == 0)
+    {
+        int sum = 0, i = 0;
+        // Find the first non-zero bin. The scan is bounded so an all-zero
+        // histogram stops at the last bin instead of running off the array.
+        while (i < HISTOGRAM256_BIN_COUNT - 1 && !sumhist[i])
+            ++i;
+
+        if (total == sumhist[i])
+        {
+            // Everything sits in bin i: map that bin onto itself. A single
+            // store is enough; no other table entry is changed here.
+            scale = 1;
+            sumhist[i] = i;
+        }
+        else
+        {
+            scale = 255.f/(total - sumhist[i]);
+
+            // The first non-zero bin maps to 0; later bins get the running sum.
+            for (sumhist[i++] = 0; i < HISTOGRAM256_BIN_COUNT; i++)
+            {
+                sum += sumhist[i];
+                sumhist[i] = sum;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    dst[lid]= convert_uchar_sat_rte(convert_float(sumhist[lid])*scale);
+}
+
+/*
+///////////////////////////////equalizeHist//////////////////////////////////////////////////
+__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
+                            __global uchar * src,
+                            __global uchar * dst,
+                            __constant int * hist,
+                            int srcstep,
+                            int srcoffset,
+                            int dststep,
+                            int dstoffset,
+                            int width,
+                            int height,
+                            float scale,
+                            int inc_x,
+                            int inc_y)
+{
+    int gidx = get_global_id(0);
+    int lid = get_local_id(0);
+    int glb_size = get_global_size(0);
+    src+=srcoffset;
+    dst+=dstoffset;
+    __local int sumhist[HISTOGRAM256_BIN_COUNT];
+    __local uchar lut[HISTOGRAM256_BIN_COUNT+1];
+
+    sumhist[lid]=hist[lid];
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(lid==0)
+    {
+        int sum = 0;
+        for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++)
+        {
+            sum+=sumhist[i];
+            sumhist[i]=sum;
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale);
+    lut[0]=0;
+    int pos_y = gidx / width;
+    int pos_x = gidx - mul24(pos_y, width);
+
+    for(int pos = gidx; pos < mul24(width,height); pos += glb_size)
+    {
+        int inaddr = mad24(pos_y,srcstep,pos_x);
+        int outaddr = mad24(pos_y,dststep,pos_x);
+        dst[outaddr] = lut[src[inaddr]];
+        pos_x +=inc_x;
+        int off = (pos_x >= width ? -1 : 0);
+        pos_x =  mad24(off,width,pos_x);
+        pos_y += inc_y - off;
+    }
+}
+*/
--- a/modules/imgproc/src/opencl/hough.cl
+++ b/modules/imgproc/src/opencl/hough.cl
@ -0,0 +1,280 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+////////////////////////////////////////////////////////////////////////
+// buildPointList
+
+#define PIXELS_PER_THREAD 16
+
+// TODO: add offset to support ROI
+//
+// buildPointList: gathers the coordinates of every non-zero pixel of `src`
+// into `list`, each packed as (y << 16) | x, and advances `*counter` by the
+// number of points appended.
+//
+//   src     - 8-bit input image, addressed as src[y * step + x]
+//   cols    - number of pixels scanned per row
+//   rows    - number of rows scanned
+//   step    - distance between consecutive rows of src, in uchar elements
+//   list    - output array receiving the packed coordinates
+//   counter - running number of entries in list; used to reserve output space
+//
+// Each work-item tests up to PIXELS_PER_THREAD pixels of row y, strided by the
+// work-group width. Hits are first queued in local memory (one queue per
+// work-group row), then a single work-item reserves a contiguous range of
+// `list` for the whole work-group and all work-items copy the queues out.
+//
+// NOTE(review): the local arrays are sized for a work-group of at most 32 x 4
+// work-items - confirm that the host launches the kernel with that shape.
+__kernel void buildPointList(__global const uchar* src,
+                             int cols,
+                             int rows,
+                             int step,
+                             __global unsigned int* list,
+                             __global int* counter)
+{
+    __local unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
+    __local int s_qsize[4];
+    __local int s_globStart[4];
+
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    const int x = get_group_id(0) * get_local_size(0) * PIXELS_PER_THREAD + lx;
+    const int y = get_global_id(1);
+
+    // the first work-item of each work-group row resets that row's queue
+    if (lx == 0)
+        s_qsize[ly] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (y < rows)
+    {
+        // fill the queue
+        __global const uchar* srcRow = &src[y * step];
+        for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < cols; ++i, xx += get_local_size(0))
+        {
+            if (srcRow[xx])
+            {
+                const unsigned int val = (y << 16) | xx;
+                const int qidx = atomic_add(&s_qsize[ly], 1);
+                s_queues[ly][qidx] = val;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // let one work-item reserve the space required in the global list
+    if (lx == 0 && ly == 0)
+    {
+        // find how many items are stored in each list
+        int totalSize = 0;
+        for (int i = 0; i < get_local_size(1); ++i)
+        {
+            s_globStart[i] = totalSize;
+            totalSize += s_qsize[i];
+        }
+
+        // calculate the offset in the global list
+        const int globalOffset = atomic_add(counter, totalSize);
+        for (int i = 0; i < get_local_size(1); ++i)
+            s_globStart[i] += globalOffset;
+    }
+
+    // s_globStart lives in local memory, so this barrier has to fence local
+    // memory for the offsets written above to be visible to every work-item.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // copy local queues to global queue
+    const int qsize = s_qsize[ly];
+    int gidx = s_globStart[ly] + lx;
+    for(int i = lx; i < qsize; i += get_local_size(0), gidx += get_local_size(0))
+        list[gidx] = s_queues[ly][i];
+}
+
+////////////////////////////////////////////////////////////////////////
+// circlesAccumCenters
+
+// TODO: add offset to support ROI
+//
+// circlesAccumCenters: casts votes for candidate circle centres.
+//
+// One work-item handles one entry of `list` (a point packed as (y << 16) | x).
+// It reads the gradient (dx, dy) at that point and walks along the gradient
+// direction, once in each sense, for radii minRadius..maxRadius, atomically
+// incrementing the accumulator cell under every visited position. The walk in
+// one sense stops as soon as a position falls outside [0, width) x [0, height).
+//
+//   list                 - packed point coordinates
+//   count                - number of valid entries in list
+//   dx, dy               - gradient images (int), with strides dxStep / dyStep
+//   accum                - vote accumulator (int), with stride accumStep;
+//                          cells are addressed with a +1 shift in x and y
+//   width, height        - bounds for the visited accumulator positions
+//   minRadius, maxRadius - inclusive radius range to walk
+//   idp                  - scale applied to coordinates and gradient steps
+//
+// The three strides are divided by sizeof(int) before use.
+__kernel void circlesAccumCenters(__global const unsigned int* list,
+                                  const int count,
+                                  __global const int* dx,
+                                  const int dxStep,
+                                  __global const int* dy,
+                                  const int dyStep,
+                                  __global int* accum,
+                                  const int accumStep,
+                                  const int width,
+                                  const int height,
+                                  const int minRadius,
+                                  const int maxRadius,
+                                  const float idp)
+{
+    const int idx = get_global_id(0);
+    if (idx >= count)
+        return;
+
+    // strides expressed in int elements
+    const int dxPitch    = dxStep    / sizeof(int);
+    const int dyPitch    = dyStep    / sizeof(int);
+    const int accumPitch = accumStep / sizeof(int);
+
+    // positions are tracked in fixed point with FRAC_BITS fractional bits
+    const int FRAC_BITS = 10;
+    const int FIXED_ONE = 1 << FRAC_BITS;
+
+    // unpack the point
+    const unsigned int packed = list[idx];
+    const int px = (packed & 0xFFFF);
+    const int py = (packed >> 16) & 0xFFFF;
+
+    const int gx = dx[mad24(py, dxPitch, px)];
+    const int gy = dy[mad24(py, dyPitch, px)];
+
+    // no gradient, no direction to walk along
+    if (gx == 0 && gy == 0)
+        return;
+
+    const float norm = sqrt(convert_float(gx * gx + gy * gy));
+
+    const int startX = convert_int_rte((px * idp) * FIXED_ONE);
+    const int startY = convert_int_rte((py * idp) * FIXED_ONE);
+
+    // per-radius increment: the normalised gradient in fixed point
+    int stepX = convert_int_rte((gx * idp) * FIXED_ONE / norm);
+    int stepY = convert_int_rte((gy * idp) * FIXED_ONE / norm);
+
+    // two passes: along the gradient, then against it
+    for (int pass = 0; pass < 2; ++pass, stepX = -stepX, stepY = -stepY)
+    {
+        int curX = startX + minRadius * stepX;
+        int curY = startY + minRadius * stepY;
+
+        for (int r = minRadius; r <= maxRadius; ++r)
+        {
+            const int cellX = curX >> FRAC_BITS;
+            const int cellY = curY >> FRAC_BITS;
+
+            const bool inside = cellX >= 0 && cellX < width && cellY >= 0 && cellY < height;
+            if (!inside)
+                break;
+
+            atomic_add(&accum[mad24(cellY + 1, accumPitch, cellX + 1)], 1);
+
+            curX += stepX;
+            curY += stepY;
+        }
+    }
+}
+
+// ////////////////////////////////////////////////////////////////////////
+// // buildCentersList
+
+// TODO: add offset to support ROI
+//
+// buildCentersList: collects the local maxima of the vote accumulator.
+//
+// Work-item (x, y) examines accumulator cell (x + 1, y + 1). If its value is
+// above `threshold` and it dominates its four direct neighbours (strictly for
+// top and left, non-strictly for bottom and right), the coordinates are packed
+// as (y << 16) | x and appended to `centers` at a slot obtained by atomically
+// incrementing `*counter`.
+//
+//   accum                - vote accumulator (int)
+//   accumCols, accumRows - accumulator size; only x < accumCols - 2 and
+//                          y < accumRows - 2 are processed
+//   accumStep            - accumulator stride, divided by sizeof(int) before use
+//   centers              - output list of packed centre coordinates
+//   threshold            - votes a cell must exceed
+//   counter              - running number of entries in centers
+__kernel void buildCentersList(__global const int* accum,
+                               const int accumCols,
+                               const int accumRows,
+                               const int accumStep,
+                               __global unsigned int* centers,
+                               const int threshold,
+                               __global int* counter)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // nothing to do outside the processed region
+    if (x >= accumCols - 2 || y >= accumRows - 2)
+        return;
+
+    const int pitch = accumStep/sizeof(int);
+
+    // the cell under test and its 4-neighbourhood
+    const int above = accum[mad24(y,     pitch, x + 1)];
+    const int west  = accum[mad24(y + 1, pitch, x)];
+    const int cur   = accum[mad24(y + 1, pitch, x + 1)];
+    const int east  = accum[mad24(y + 1, pitch, x + 2)];
+    const int below = accum[mad24(y + 2, pitch, x + 1)];
+
+    const bool enoughVotes = cur > threshold;
+    const bool isPeak = cur > above && cur >= below && cur > west && cur >= east;
+
+    if (enoughVotes && isPeak)
+    {
+        const int slot = atomic_add(counter, 1);
+        centers[slot] = (y << 16) | x;
+    }
+}
+
+
+// ////////////////////////////////////////////////////////////////////////
+// // circlesAccumRadius
+
+// TODO: add offset to support ROI
+//
+// circlesAccumRadius: estimates circle radii for the candidate centres.
+//
+// Each work-group takes the centre centers[get_group_id(0)], builds a
+// histogram in local memory of the distances from that centre to every point
+// in `list`, and emits a circle for every histogram bin that reaches
+// `threshold` votes and is a peak relative to its two neighbouring bins.
+//
+//   centers              - packed centre coordinates, (y << 16) | x
+//   list, count          - packed point coordinates and their number
+//   circles, maxCircles  - output (x, y, radius, 0) tuples and their capacity;
+//                          results past maxCircles are dropped, but *counter
+//                          is still incremented for them
+//   dp                   - scale applied to the unpacked centre coordinates
+//   minRadius, maxRadius - inclusive range of accepted distances
+//   histSize             - number of radius bins
+//   threshold            - minimum number of votes for a bin to be reported
+//   smem                 - local scratch holding at least histSize + 2 ints
+//   counter              - running number of circles found
+__kernel void circlesAccumRadius(__global const unsigned int* centers,
+                                 __global const unsigned int* list, const int count,
+                                 __global float4* circles, const int maxCircles,
+                                 const float dp,
+                                 const int minRadius, const int maxRadius,
+                                 const int histSize,
+                                 const int threshold,
+                                 __local int* smem,
+                                 __global int* counter)
+{
+    const int tid = get_local_id(0);
+    const int groupSize = get_local_size(0);
+
+    // clear the histogram, including the guard bin on each side
+    for (int bin = tid; bin < histSize + 2; bin += groupSize)
+        smem[bin] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // unpack this group's centre and move it to the middle of its cell
+    const unsigned int packedCenter = centers[get_group_id(0)];
+    const float cx = (convert_float(packedCenter & 0xFFFF) + 0.5f) * dp;
+    const float cy = (convert_float((packedCenter >> 16) & 0xFFFF) + 0.5f) * dp;
+
+    // vote: one histogram increment per point whose distance is in range
+    for (int p = tid; p < count; p += groupSize)
+    {
+        const unsigned int packedPoint = list[p];
+
+        const int px = (packedPoint & 0xFFFF);
+        const int py = (packedPoint >> 16) & 0xFFFF;
+
+        const float ddx = cx - px;
+        const float ddy = cy - py;
+        const float rad = sqrt(ddx * ddx + ddy * ddy);
+
+        if (!(rad >= minRadius && rad <= maxRadius))
+            continue;
+
+        // bin 0 is a guard, so radius offset r lands in smem[r + 1]
+        const int r = convert_int_rte(rad - minRadius);
+        atomic_add(&smem[r + 1], 1);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // report every sufficiently voted bin that is a peak among its neighbours
+    for (int bin = tid; bin < histSize; bin += groupSize)
+    {
+        const int votes = smem[bin + 1];
+
+        const bool enoughVotes = votes >= threshold;
+        const bool isPeak = votes > smem[bin] && votes >= smem[bin + 2];
+        if (!(enoughVotes && isPeak))
+            continue;
+
+        const int slot = atomic_add(counter, 1);
+        if (slot < maxCircles)
+            circles[slot] = (float4)(cx, cy, convert_float(bin + minRadius), 0.0f);
+    }
+}
--- a/modules/imgproc/src/opencl/integral.cl
+++ b/modules/imgproc/src/opencl/integral.cl
@ -0,0 +1,493 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+#define LSIZE 256 // elements per local scan buffer
+#define LSIZE_1 255 // LSIZE - 1: rows consumed per outer-loop iteration of the kernels
+#define LSIZE_2 254 // LSIZE - 2: LSIZE_2 + LOG_LSIZE is the padded slot of the last element
+#define HF_LSIZE 128 // LSIZE / 2; not referenced by the kernels in this file
+#define LOG_LSIZE 8 // log2(LSIZE); also the padding added to each local scan buffer
+#define LOG_NUM_BANKS 5 // log2(NUM_BANKS)
+#define NUM_BANKS 32 // NOTE(review): presumably the local-memory bank count; not referenced in this file
+#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) // one extra slot of padding per 32 elements
+
+/*
+ * integral_cols_D4: column pass of the integral image, int sums.
+ *
+ * Each work-group handles two adjacent uchar4 columns of `src` (indices
+ * 2*group and 2*group+1, clamped to cols-1) and accumulates, per component,
+ * the values (`sum`, int) and their squares (`sqsum`, float) down the rows.
+ * Rows are consumed in chunks of LSIZE_1; every chunk is scanned in local
+ * memory with an up-sweep / down-sweep pass and the total of the preceding
+ * chunks is added as a carry.
+ *
+ * The result is written transposed: for source row r and component index
+ * c = gid*4 + k (gid already doubled), the sum of rows 0..r is stored at
+ * offset (c - pre_invalid) * (dst_step / 4) + r in both `sum` and `sqsum`
+ * (assuming dst_step is a multiple of 4). Components outside
+ * [pre_invalid, cols + pre_invalid) are not written.
+ *
+ * NOTE(review): the `lid & 127` / `lid >> 7` indexing implies a 1-D
+ * work-group of 256 work-items - confirm against the host launch code.
+ * NOTE(review): src_step and dst_step look like byte strides (shifted by 2 /
+ * divided by 4 before use) - confirm.
+ */
+kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum,
+                          int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    int4 src_t[2], sum_t[2];
+    float4 sqsum_t[2];
+    __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; // padded scan buffers, one per column
+    __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
+    __local int* sum_p;
+    __local float* sqsum_p;
+    src_step = src_step >> 2; // stride in uchar4 elements
+    gid = gid << 1; // index of the first of this group's two columns
+    for(int i = 0; i < rows; i =i + LSIZE_1) // one chunk of LSIZE_1 rows per iteration
+    {
+        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0);
+        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0);
+        // carry: running total left in the last padded slot by the previous chunk
+        sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[1] =  (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // every work-item has read the carry; the buffers can now be overwritten
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid); // padded slot of this work-item's row
+        lm_sum[0][bf_loc] = src_t[0];
+        lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]);
+
+        lm_sum[1][bf_loc] = src_t[1];
+        lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
+        // up-sweep: lid >> 7 selects the column buffer, lid & 127 the pair to combine
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < 2) // clear the last element of both buffers before the down-sweep
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        for(int d = 1;  d < LSIZE; d <<= 1) // down-sweep: leaves an exclusive prefix sum in each slot
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+
+                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
+        if(lid > 0 && (i+lid) <= rows) // slot lid holds the sum of rows i .. i+lid-1 of this chunk
+        {
+            lm_sum[0][bf_loc] += sum_t[0]; // add the carry from the previous chunks
+            lm_sum[1][bf_loc] += sum_t[1];
+            lm_sqsum[0][bf_loc] += sqsum_t[0];
+            lm_sqsum[1][bf_loc] += sqsum_t[1];
+            sum_p = (__local int*)(&(lm_sum[0][bf_loc])); // view the vector as 4 scalars
+            sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
+                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
+                sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
+            }
+            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
+            sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
+                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
+                sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+/*
+ * integral_rows_D4: second accumulation pass of the integral image, int sums.
+ *
+ * Reads int4 / float4 partial sums (`srcsum`, `srcsqsum`), two vector columns
+ * per work-group (gid*2 and gid*2+1), and accumulates them down the rows in
+ * chunks of LSIZE_1 using a local-memory up-sweep / down-sweep scan with a
+ * carry between chunks.
+ *
+ * Output layout, as established by the index arithmetic below (shown for
+ * `sum`; `sqsum` uses sqsum_offset / sqsum_step in the same way):
+ *   - elements sum_offset + 0 .. sum_offset + rows are zeroed by group 0;
+ *   - element 0 of output rows gid*8+1 .. gid*8+8 (bounded by cols) is zeroed;
+ *   - the accumulated value for input row r and component c = gid*8 + k is
+ *     stored at sum_offset + (c + 1) * (sum_step / 4) + r + 1.
+ *
+ * NOTE(review): the inputs are presumably the transposed outputs of the
+ * column pass - confirm against the host code.
+ * NOTE(review): the `lid & 127` / `lid >> 7` indexing implies a 1-D
+ * work-group of 256 work-items - confirm against the host launch code.
+ */
+kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum ,
+                          __global float *sqsum,int rows,int cols,int src_step,int sum_step,
+                          int sqsum_step,int sum_offset,int sqsum_offset)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    int4 src_t[2], sum_t[2];
+    float4 sqsrc_t[2],sqsum_t[2];
+    __local int4 lm_sum[2][LSIZE + LOG_LSIZE]; // padded scan buffers, one per vector column
+    __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
+    __local int *sum_p;
+    __local float *sqsum_p;
+    src_step = src_step >> 4; // stride in 16-byte (int4 / float4) elements
+    for(int i = 0; i < rows; i =i + LSIZE_1) // one chunk of LSIZE_1 rows per iteration
+    {
+        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0;
+        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
+        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0;
+        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
+        // carry: running total left in the last padded slot by the previous chunk
+        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[0] =  (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[1] =  (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // every work-item has read the carry; the buffers can now be overwritten
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid); // padded slot of this work-item's row
+        lm_sum[0][bf_loc] = src_t[0];
+        lm_sqsum[0][bf_loc] = sqsrc_t[0];
+
+        lm_sum[1][bf_loc] = src_t[1];
+        lm_sqsum[1][bf_loc] = sqsrc_t[1];
+        // up-sweep: lid >> 7 selects the column buffer, lid & 127 the pair to combine
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < 2) // clear the last element of both buffers before the down-sweep
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        for(int d = 1;  d < LSIZE; d <<= 1) // down-sweep: leaves an exclusive prefix sum in each slot
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+
+                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(gid == 0 && (i + lid) <= rows) // group 0 zeroes the first output row
+        {
+            sum[sum_offset + i + lid] = 0;
+            sqsum[sqsum_offset + i + lid] = 0;
+        }
+        if(i + lid == 0) // one work-item zeroes element 0 of this group's output rows
+        {
+            int loc0 = gid * 2 * sum_step;
+            int loc1 = gid * 2 * sqsum_step;
+            for(int k = 1; k <= 8; k++)
+            {
+                if(gid * 8 + k > cols) break;
+                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
+                sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
+            }
+        }
+        int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
+        int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
+        if(lid > 0 && (i+lid) <= rows) // slot lid holds the sum of rows i .. i+lid-1 of this chunk
+        {
+            lm_sum[0][bf_loc] += sum_t[0]; // add the carry from the previous chunks
+            lm_sum[1][bf_loc] += sum_t[1];
+            lm_sqsum[0][bf_loc] += sqsum_t[0];
+            lm_sqsum[1][bf_loc] += sqsum_t[1];
+            sum_p = (__local int*)(&(lm_sum[0][bf_loc])); // view the vector as 4 scalars
+            sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + k >= cols) break;
+                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
+                sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
+            }
+            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
+            sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + 4 + k >= cols) break;
+                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
+                sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+/*
+ * integral_cols_D5: column pass of the integral image, float sums.
+ *
+ * Each work-group handles two adjacent uchar4 columns of `src` (indices
+ * 2*group and 2*group+1, clamped to cols-1) and accumulates, per component,
+ * the values (`sum`, float) and their squares (`sqsum`, float) down the rows.
+ * Rows are consumed in chunks of LSIZE_1; every chunk is scanned in local
+ * memory with an up-sweep / down-sweep pass and the total of the preceding
+ * chunks is added as a carry.
+ *
+ * The result is written transposed: for source row r and component index
+ * c = gid*4 + k (gid already doubled), the sum of rows 0..r is stored at
+ * offset (c - pre_invalid) * (dst_step / 4) + r in both `sum` and `sqsum`
+ * (assuming dst_step is a multiple of 4). Components outside
+ * [pre_invalid, cols + pre_invalid) are not written.
+ *
+ * NOTE(review): the `lid & 127` / `lid >> 7` indexing implies a 1-D
+ * work-group of 256 work-items - confirm against the host launch code.
+ * NOTE(review): src_step and dst_step look like byte strides (shifted by 2 /
+ * divided by 4 before use) - confirm.
+ */
+kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum,
+                          int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    float4 src_t[2], sum_t[2];
+    float4 sqsum_t[2];
+    __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; // padded scan buffers, one per column
+    __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
+    __local float* sum_p;
+    __local float* sqsum_p;
+    src_step = src_step >> 2; // stride in uchar4 elements
+    gid = gid << 1; // index of the first of this group's two columns
+    for(int i = 0; i < rows; i =i + LSIZE_1) // one chunk of LSIZE_1 rows per iteration
+    {
+        src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0);
+        src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0);
+        // carry: running total left in the last padded slot by the previous chunk
+        sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[1] =  (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // every work-item has read the carry; the buffers can now be overwritten
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid); // padded slot of this work-item's row
+        lm_sum[0][bf_loc] = src_t[0];
+        lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); // src_t is already float4
+
+        lm_sum[1][bf_loc] = src_t[1];
+        lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]);
+        // up-sweep: lid >> 7 selects the column buffer, lid & 127 the pair to combine
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < 2) // clear the last element of both buffers before the down-sweep
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        for(int d = 1;  d < LSIZE; d <<= 1) // down-sweep: leaves an exclusive prefix sum in each slot
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+
+                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
+        if(lid > 0 && (i+lid) <= rows) // slot lid holds the sum of rows i .. i+lid-1 of this chunk
+        {
+            lm_sum[0][bf_loc] += sum_t[0]; // add the carry from the previous chunks
+            lm_sum[1][bf_loc] += sum_t[1];
+            lm_sqsum[0][bf_loc] += sqsum_t[0];
+            lm_sqsum[1][bf_loc] += sqsum_t[1];
+            sum_p = (__local float*)(&(lm_sum[0][bf_loc])); // view the vector as 4 scalars
+            sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
+                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
+                sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k];
+            }
+            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
+            sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
+                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
+                sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+/*
+ * integral_rows_D5: second accumulation pass of the integral image, float sums.
+ *
+ * Reads float4 partial sums (`srcsum`, `srcsqsum`), two vector columns per
+ * work-group (gid*2 and gid*2+1), and accumulates them down the rows in
+ * chunks of LSIZE_1 using a local-memory up-sweep / down-sweep scan with a
+ * carry between chunks.
+ *
+ * Output layout, as established by the index arithmetic below (shown for
+ * `sum`; `sqsum` uses sqsum_offset / sqsum_step in the same way):
+ *   - elements sum_offset + 0 .. sum_offset + rows are zeroed by group 0;
+ *   - element 0 of output rows gid*8+1 .. gid*8+8 (bounded by cols) is zeroed;
+ *   - the accumulated value for input row r and component c = gid*8 + k is
+ *     stored at sum_offset + (c + 1) * (sum_step / 4) + r + 1.
+ *
+ * NOTE(review): the inputs are presumably the transposed outputs of the
+ * column pass - confirm against the host code.
+ * NOTE(review): the `lid & 127` / `lid >> 7` indexing implies a 1-D
+ * work-group of 256 work-items - confirm against the host launch code.
+ */
+kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum ,
+                          __global float *sqsum,int rows,int cols,int src_step,int sum_step,
+                          int sqsum_step,int sum_offset,int sqsum_offset)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    float4 src_t[2], sum_t[2];
+    float4 sqsrc_t[2],sqsum_t[2];
+    __local float4 lm_sum[2][LSIZE + LOG_LSIZE]; // padded scan buffers, one per vector column
+    __local float4 lm_sqsum[2][LSIZE + LOG_LSIZE];
+    __local float *sum_p;
+    __local float *sqsum_p;
+    src_step = src_step >> 4; // stride in 16-byte (float4) elements
+    for(int i = 0; i < rows; i =i + LSIZE_1) // one chunk of LSIZE_1 rows per iteration
+    {
+        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
+        sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0;
+        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
+        sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
+        // carry: running total left in the last padded slot by the previous chunk
+        sum_t[0] =  (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[0] =  (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        sqsum_t[1] =  (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // every work-item has read the carry; the buffers can now be overwritten
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid); // padded slot of this work-item's row
+        lm_sum[0][bf_loc] = src_t[0];
+        lm_sqsum[0][bf_loc] = sqsrc_t[0];
+
+        lm_sum[1][bf_loc] = src_t[1];
+        lm_sqsum[1][bf_loc] = sqsrc_t[1];
+        // up-sweep: lid >> 7 selects the column buffer, lid & 127 the pair to combine
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][bi]  +=  lm_sqsum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < 2) // clear the last element of both buffers before the down-sweep
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+            lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        for(int d = 1;  d < LSIZE; d <<= 1) // down-sweep: leaves an exclusive prefix sum in each slot
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+
+                lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai];
+                lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(gid == 0 && (i + lid) <= rows) // group 0 zeroes the first output row
+        {
+            sum[sum_offset + i + lid] = 0;
+            sqsum[sqsum_offset + i + lid] = 0;
+        }
+        if(i + lid == 0) // one work-item zeroes element 0 of this group's output rows
+        {
+            int loc0 = gid * 2 * sum_step;
+            int loc1 = gid * 2 * sqsum_step;
+            for(int k = 1; k <= 8; k++)
+            {
+                if(gid * 8 + k > cols) break;
+                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
+                sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0;
+            }
+        }
+        int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
+        int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ;
+        if(lid > 0 && (i+lid) <= rows) // slot lid holds the sum of rows i .. i+lid-1 of this chunk
+        {
+            lm_sum[0][bf_loc] += sum_t[0]; // add the carry from the previous chunks
+            lm_sum[1][bf_loc] += sum_t[1];
+            lm_sqsum[0][bf_loc] += sqsum_t[0];
+            lm_sqsum[1][bf_loc] += sqsum_t[1];
+            sum_p = (__local float*)(&(lm_sum[0][bf_loc])); // view the vector as 4 scalars
+            sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + k >= cols) break;
+                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
+                sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k];
+            }
+            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
+            sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + 4 + k >= cols) break;
+                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
+                sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
--- a/modules/imgproc/src/opencl/integral_sum.cl
+++ b/modules/imgproc/src/opencl/integral_sum.cl
@ -0,0 +1,412 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Enable double precision when the build defines DOUBLE_SUPPORT and the device
+// exposes either the Khronos (cl_khr_fp64) or the AMD (cl_amd_fp64) extension.
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+// Scan-tile geometry shared by all kernels in this file.
+// LSIZE   : number of elements scanned per tile (the kernels index local memory
+//           with lid >> 7 and lid & 127, so they presumably expect LSIZE
+//           work-items per group -- confirm against the host launch code).
+// LSIZE_1 : LSIZE - 1, the row stride between consecutive tiles.
+// LSIZE_2 : LSIZE - 2; LSIZE_2 + LOG_LSIZE is the padded slot of the last element.
+// HF_LSIZE: LSIZE / 2 (not referenced elsewhere in this file).
+#define LSIZE 256
+#define LSIZE_1 255
+#define LSIZE_2 254
+#define HF_LSIZE 128
+// log2(LSIZE); also the number of padding slots appended to each local array.
+#define LOG_LSIZE 8
+// Padding scheme: one extra slot is inserted after every 2^LOG_NUM_BANKS elements.
+// NUM_BANKS itself is not referenced elsewhere in this file.
+#define LOG_NUM_BANKS 5
+#define NUM_BANKS 32
+// Number of padding slots that precede logical index `lid`.
+#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS)
+
+
+// First pass of the integer integral image: running sums down the rows of an
+// 8-bit source, written out transposed.
+//
+// Each work-group handles two adjacent uchar4 columns of `src` (8 source bytes
+// per row) and each work-item handles one row of the current tile. Rows are
+// processed in tiles that advance by LSIZE_1; the running total is carried from
+// one tile to the next through the last padded slot of the local arrays.
+//
+//   src         source data, indexed as uchar4 elements
+//   sum         int output; the source byte column selects the output row and
+//               the source row selects the position inside that row
+//   src_offset  added to every src index
+//   pre_invalid byte columns below this index are skipped, and the output is
+//               shifted back by pre_invalid rows
+//   rows, cols  extent of the region to sum
+//   src_step    shifted right by 2 before use (presumably bytes -> uchar4 units)
+//   dst_step    used as dst_step / 4 per output row (presumably bytes -> ints)
+//
+// NOTE(review): the lid >> 7 / lid & 127 indexing assumes LSIZE (256)
+// work-items per group -- confirm against the host launch code.
+kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum ,
+                              int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    int4 src_t[2], sum_t[2];
+    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
+    __local int* sum_p;
+    src_step = src_step >> 2;
+    // From here on gid is the index of the first of the two uchar4 columns.
+    gid = gid << 1;
+    for(int i = 0; i < rows; i =i + LSIZE_1)
+    {
+        // One source row per work-item; rows past the end contribute zero.
+        src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0);
+        src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0);
+
+        // Carry-in from the previous tile (zero for the first tile). It is read
+        // before the barrier because the slot is overwritten right after it.
+        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Padded slot of this work-item's element.
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
+        lm_sum[0][bf_loc] = src_t[0];
+
+        lm_sum[1][bf_loc] = src_t[1];
+
+        // Up-sweep (pairwise reduction tree). lid >> 7 selects which of the two
+        // arrays a work-item works on; lid & 127 is its index within that half.
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Clear the last element of each array so the down-sweep below produces
+        // an exclusive prefix sum.
+        if(lid < 2)
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        // Down-sweep: push partial sums back down the tree.
+        for(int d = 1;  d < LSIZE; d <<= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Slot lid now holds the sum of tile rows 0..lid-1, i.e. the inclusive
+        // running sum for source row i + lid - 1 once the carry is added.
+        if(lid > 0 && (i+lid) <= rows)
+        {
+            // Transposed store: gid * dst_step equals (gid * 4) output rows of
+            // dst_step / 4 ints each; i + lid - 1 is the position inside the row.
+            int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
+            lm_sum[0][bf_loc] += sum_t[0];
+            lm_sum[1][bf_loc] += sum_t[1];
+            // Four components of the first uchar4 column; byte columns outside
+            // [pre_invalid, cols + pre_invalid) are skipped.
+            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
+                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
+            }
+            // Four components of the second uchar4 column; only the upper bound
+            // is checked here.
+            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
+                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
+            }
+        }
+        // Keep the carry slot stable until every work-item has finished with it.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+
+// Second pass of the integer integral image: running sums over the int4 buffer
+// `srcsum`, written out transposed into `sum` with a leading row and a leading
+// column of zeros.
+//
+// Each work-group handles two adjacent int4 columns of `srcsum` (8 ints per
+// row) and each work-item handles one row of the current tile. Tiles advance by
+// LSIZE_1 rows; the running total is carried between tiles through the last
+// padded slot of the local arrays.
+//
+//   srcsum      input, indexed as int4 elements
+//   sum         int output; output row 0 and the first element of every output
+//               row are set to zero, the data lands one row and one element
+//               further in
+//   rows, cols  extent of the region to sum
+//   src_step    shifted right by 4 before use (presumably bytes -> int4 units)
+//   sum_step    used as sum_step / 4 per output row (presumably bytes -> ints)
+//   sum_offset  added to every sum index
+//
+// NOTE(review): the lid >> 7 / lid & 127 indexing assumes LSIZE (256)
+// work-items per group -- confirm against the host launch code.
+kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum ,
+                              int rows,int cols,int src_step,int sum_step,
+                              int sum_offset)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    int4 src_t[2], sum_t[2];
+    __local int4 lm_sum[2][LSIZE + LOG_LSIZE];
+    __local int *sum_p;
+    src_step = src_step >> 4;
+    for(int i = 0; i < rows; i =i + LSIZE_1)
+    {
+        // One input row per work-item; rows past the end contribute zero.
+        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0;
+        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0;
+
+        // Carry-in from the previous tile (zero for the first tile). It is read
+        // before the barrier because the slot is overwritten right after it.
+        sum_t[0] =  (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Padded slot of this work-item's element.
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
+        lm_sum[0][bf_loc] = src_t[0];
+
+        lm_sum[1][bf_loc] = src_t[1];
+
+        // Up-sweep (pairwise reduction tree). lid >> 7 selects which of the two
+        // arrays a work-item works on; lid & 127 is its index within that half.
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Clear the last element of each array so the down-sweep below produces
+        // an exclusive prefix sum.
+        if(lid < 2)
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        // Down-sweep: push partial sums back down the tree.
+        for(int d = 1;  d < LSIZE; d <<= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Group 0 zeroes output row 0 (positions 0..rows).
+        if(gid == 0 && (i + lid) <= rows)
+        {
+            sum[sum_offset + i + lid] = 0;
+        }
+        // A single work-item per group (first tile, lid 0) zeroes the first
+        // element of the up to eight output rows this group produces.
+        if(i + lid == 0)
+        {
+            int loc0 = gid * 2 * sum_step;
+            for(int k = 1; k <= 8; k++)
+            {
+                if(gid * 8 + k > cols) break;
+                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
+            }
+        }
+
+        // Slot lid now holds the sum of tile rows 0..lid-1, i.e. the inclusive
+        // running sum for input row i + lid - 1 once the carry is added.
+        if(lid > 0 && (i+lid) <= rows)
+        {
+            // gid * 2 * sum_step equals (gid * 8) output rows of sum_step / 4
+            // ints each; the extra sum_step / 4 skips the zero row, and position
+            // i + lid (rather than i + lid - 1) skips the zero column.
+            int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
+            lm_sum[0][bf_loc] += sum_t[0];
+            lm_sum[1][bf_loc] += sum_t[1];
+            // Four components of the first int4 column.
+            sum_p = (__local int*)(&(lm_sum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + k >= cols) break;
+                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
+            }
+            // Four components of the second int4 column.
+            sum_p = (__local int*)(&(lm_sum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + 4 + k >= cols) break;
+                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
+            }
+        }
+        // Keep the carry slot stable until every work-item has finished with it.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+// First pass of the floating-point integral image: running sums down the rows
+// of an 8-bit source, accumulated in float and written out transposed.
+//
+// Each work-group handles two adjacent uchar4 columns of `src` (8 source bytes
+// per row) and each work-item handles one row of the current tile. Rows are
+// processed in tiles that advance by LSIZE_1; the running total is carried from
+// one tile to the next through the last padded slot of the local arrays.
+//
+//   src         source data, indexed as uchar4 elements
+//   sum         float output; the source byte column selects the output row and
+//               the source row selects the position inside that row
+//   src_offset  added to every src index
+//   pre_invalid byte columns below this index are skipped, and the output is
+//               shifted back by pre_invalid rows
+//   rows, cols  extent of the region to sum
+//   src_step    shifted right by 2 before use (presumably bytes -> uchar4 units)
+//   dst_step    used as dst_step / 4 per output row (presumably bytes -> floats)
+//
+// NOTE(review): the lid >> 7 / lid & 127 indexing assumes LSIZE (256)
+// work-items per group -- confirm against the host launch code.
+kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum ,
+                              int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    float4 src_t[2], sum_t[2];
+    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
+    __local float* sum_p;
+    src_step = src_step >> 2;
+    // From here on gid is the index of the first of the two uchar4 columns.
+    gid = gid << 1;
+    for(int i = 0; i < rows; i =i + LSIZE_1)
+    {
+        // One source row per work-item; rows past the end contribute zero.
+        src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0);
+        src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0);
+
+        // Carry-in from the previous tile (zero for the first tile). It is read
+        // before the barrier because the slot is overwritten right after it.
+        sum_t[0] =  (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Padded slot of this work-item's element.
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
+        lm_sum[0][bf_loc] = src_t[0];
+
+        lm_sum[1][bf_loc] = src_t[1];
+
+        // Up-sweep (pairwise reduction tree). lid >> 7 selects which of the two
+        // arrays a work-item works on; lid & 127 is its index within that half.
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Clear the last element of each array so the down-sweep below produces
+        // an exclusive prefix sum.
+        if(lid < 2)
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        // Down-sweep: push partial sums back down the tree.
+        for(int d = 1;  d < LSIZE; d <<= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Slot lid now holds the sum of tile rows 0..lid-1, i.e. the inclusive
+        // running sum for source row i + lid - 1 once the carry is added.
+        if(lid > 0 && (i+lid) <= rows)
+        {
+            // Transposed store: gid * dst_step equals (gid * 4) output rows of
+            // dst_step / 4 floats each; i + lid - 1 is the position inside the row.
+            int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ;
+            lm_sum[0][bf_loc] += sum_t[0];
+            lm_sum[1][bf_loc] += sum_t[1];
+            // Four components of the first uchar4 column; byte columns outside
+            // [pre_invalid, cols + pre_invalid) are skipped.
+            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue;
+                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
+            }
+            // Four components of the second uchar4 column; only the upper bound
+            // is checked here.
+            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 4 + k + 4 >= cols + pre_invalid) break;
+                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
+            }
+        }
+        // Keep the carry slot stable until every work-item has finished with it.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
+
+
+// Second pass of the floating-point integral image: running sums over the
+// float4 buffer `srcsum`, written out transposed into `sum` with a leading row
+// and a leading column of zeros.
+//
+// Each work-group handles two adjacent float4 columns of `srcsum` (8 floats per
+// row) and each work-item handles one row of the current tile. Tiles advance by
+// LSIZE_1 rows; the running total is carried between tiles through the last
+// padded slot of the local arrays.
+//
+//   srcsum      input, indexed as float4 elements
+//   sum         float output; output row 0 and the first element of every
+//               output row are set to zero, the data lands one row and one
+//               element further in
+//   rows, cols  extent of the region to sum
+//   src_step    shifted right by 4 before use (presumably bytes -> float4 units)
+//   sum_step    used as sum_step / 4 per output row (presumably bytes -> floats)
+//   sum_offset  added to every sum index
+//
+// NOTE(review): the lid >> 7 / lid & 127 indexing assumes LSIZE (256)
+// work-items per group -- confirm against the host launch code.
+kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum ,
+                              int rows,int cols,int src_step,int sum_step,
+                              int sum_offset)
+{
+    unsigned int lid = get_local_id(0);
+    unsigned int gid = get_group_id(0);
+    float4 src_t[2], sum_t[2];
+    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
+    __local float *sum_p;
+    src_step = src_step >> 4;
+    for(int i = 0; i < rows; i =i + LSIZE_1)
+    {
+        // One input row per work-item; rows past the end contribute zero.
+        src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0;
+        src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0;
+
+        // Carry-in from the previous tile (zero for the first tile). It is read
+        // before the barrier because the slot is overwritten right after it.
+        sum_t[0] =  (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
+        sum_t[1] =  (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Padded slot of this work-item's element.
+        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
+        lm_sum[0][bf_loc] = src_t[0];
+
+        lm_sum[1][bf_loc] = src_t[1];
+
+        // Up-sweep (pairwise reduction tree). lid >> 7 selects which of the two
+        // arrays a work-item works on; lid & 127 is its index within that half.
+        int offset = 1;
+        for(int d = LSIZE >> 1 ;  d > 0; d>>=1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi]  +=  lm_sum[lid >> 7][ai];
+            }
+            offset <<= 1;
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Clear the last element of each array so the down-sweep below produces
+        // an exclusive prefix sum.
+        if(lid < 2)
+        {
+            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
+        }
+        // Down-sweep: push partial sums back down the tree.
+        for(int d = 1;  d < LSIZE; d <<= 1)
+        {
+            barrier(CLK_LOCAL_MEM_FENCE);
+            offset >>= 1;
+            int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset;
+            ai += GET_CONFLICT_OFFSET(ai);
+            bi += GET_CONFLICT_OFFSET(bi);
+
+            if((lid & 127) < d)
+            {
+                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
+                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai];
+            }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // Group 0 zeroes output row 0 (positions 0..rows).
+        if(gid == 0 && (i + lid) <= rows)
+        {
+            sum[sum_offset + i + lid] = 0;
+        }
+        // A single work-item per group (first tile, lid 0) zeroes the first
+        // element of the up to eight output rows this group produces.
+        if(i + lid == 0)
+        {
+            int loc0 = gid * 2 * sum_step;
+            for(int k = 1; k <= 8; k++)
+            {
+                if(gid * 8 + k > cols) break;
+                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
+            }
+        }
+
+        // Slot lid now holds the sum of tile rows 0..lid-1, i.e. the inclusive
+        // running sum for input row i + lid - 1 once the carry is added.
+        if(lid > 0 && (i+lid) <= rows)
+        {
+            // gid * 2 * sum_step equals (gid * 8) output rows of sum_step / 4
+            // floats each; the extra sum_step / 4 skips the zero row, and
+            // position i + lid (rather than i + lid - 1) skips the zero column.
+            int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ;
+            lm_sum[0][bf_loc] += sum_t[0];
+            lm_sum[1][bf_loc] += sum_t[1];
+            // Four components of the first float4 column.
+            sum_p = (__local float*)(&(lm_sum[0][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + k >= cols) break;
+                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
+            }
+            // Four components of the second float4 column.
+            sum_p = (__local float*)(&(lm_sum[1][bf_loc]));
+            for(int k = 0; k < 4; k++)
+            {
+                if(gid * 8 + 4 + k >= cols) break;
+                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
+            }
+        }
+        // Keep the carry slot stable until every work-item has finished with it.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+}
--- a/modules/imgproc/src/opencl/laplacian.cl
+++ b/modules/imgproc/src/opencl/laplacian.cl
@ -0,0 +1,381 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Pang Erping, erping@multicorewareinc.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Peng Xiao, pengxiao@outlook.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Coordinate-remapping helpers, selected by the BORDER_* build define.
+//   ADDR_L / ADDR_H : remap an index that lies below the left / top edge; the
+//                     third argument is accepted but not used by these macros.
+//   ADDR_R / ADDR_B : remap an index at or beyond the right / bottom edge,
+//                     otherwise yield `addr` (the already remapped value).
+// No ADDR_* macros are defined for BORDER_CONSTANT; the kernels handle that
+// mode in a separate #ifdef branch.
+#ifdef BORDER_REPLICATE
+
+//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? ((l_edge)<<1)-(i)-1                 : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? ((t_edge)<<1)-(i)-1                 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT_101
+//BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? ((l_edge)<<1)-(i)                 : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? ((t_edge)<<1)-(i)                 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+// Pixel type selection, driven by an IMG_C_<a>_<b> build define (presumably
+// <a> = channel count and <b> = OpenCV depth code, 0 for 8-bit unsigned and
+// 5 for 32-bit float -- confirm against the host code).
+//   T_IMG          one pixel
+//   T_IMGx4        four horizontally adjacent pixels
+//   T_IMG_C1       scalar type of a single channel
+//   CONVERT_TYPE   accumulator -> T_IMG conversion (saturating for uchar)
+//   CONVERT_TYPEx4 accumulator -> T_IMGx4 conversion
+#ifdef IMG_C_1_0
+#define T_IMG   uchar
+#define T_IMGx4 uchar4
+#define T_IMG_C1 uchar
+#define CONVERT_TYPE   convert_uchar_sat
+#define CONVERT_TYPEx4 convert_uchar4_sat
+#endif
+#ifdef IMG_C_4_0
+#define T_IMG   uchar4
+#define T_IMGx4 uchar16
+#define T_IMG_C1 uchar
+#define CONVERT_TYPE   convert_uchar4_sat
+#define CONVERT_TYPEx4 convert_uchar16_sat
+#endif
+#ifdef IMG_C_1_5
+#define T_IMG   float
+#define T_IMGx4 float4
+#define T_IMG_C1 float
+#define CONVERT_TYPE   convert_float
+#define CONVERT_TYPEx4 convert_float4
+#endif
+#ifdef IMG_C_4_5
+#define T_IMG   float4
+#define T_IMGx4 float16
+#define T_IMG_C1 float
+#define CONVERT_TYPE   convert_float4
+#define CONVERT_TYPEx4 convert_float16
+#endif
+
+// Channel count; defaults to single channel when the build does not set it.
+#ifndef CN
+#define CN 1
+#endif
+
+// Accumulator types (always float based), their zero initialisers, the vector
+// load that fetches four pixels at once, and the swizzles SX..SW that select
+// pixel 0..3 inside a four-pixel vector.
+#if CN == 1
+#define T_SUM   float
+#define T_SUMx4 float4
+#define CONVERT_TYPE_SUM   convert_float
+#define CONVERT_TYPE_SUMx4 convert_float4
+#define SUM_ZERO   (0.0f)
+#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
+#define VLOAD4 vload4
+#define SX x
+#define SY y
+#define SZ z
+#define SW w
+#elif CN == 4
+#define T_SUM float4
+#define T_SUMx4 float16
+#define CONVERT_TYPE_SUM   convert_float4
+#define CONVERT_TYPE_SUMx4 convert_float16
+#define SUM_ZERO   (0.0f, 0.0f, 0.0f, 0.0f)
+#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
+#define VLOAD4 vload16
+#define SX s0123
+#define SY s4567
+#define SZ s89ab
+#define SW scdef
+#endif
+
+// Side length of the square filter; defaults to 3 when the build does not set it.
+#ifndef FILTER_SIZE
+#define FILTER_SIZE 3
+#endif
+
+// Edge length of the output tile the local-memory layout is sized for
+// (presumably the work-group is LOCAL_GROUP_SIZE x LOCAL_GROUP_SIZE -- confirm
+// against the host launch code).
+#define LOCAL_GROUP_SIZE 16
+
+// Local tile dimensions: the output tile plus a halo of FILTER_SIZE/2 pixels on
+// every side.
+#define LOCAL_WIDTH  ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
+#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
+
+// Distance from the filter centre to its edge.
+#define FILTER_RADIUS (FILTER_SIZE >> 1)
+
+// Generic FILTER_SIZE x FILTER_SIZE filter.
+//
+// Stage 1: the work-group cooperatively copies a LOCAL_HEIGHT x LOCAL_WIDTH
+//          source tile (output tile plus halo) into `local_data`, applying the
+//          border mode chosen at build time.
+// Stage 2: every work-item that maps to a valid output pixel accumulates the
+//          FILTER_SIZE * FILTER_SIZE products of tile pixels and `mat_kernel`
+//          coefficients (row-major) and stores the converted result.
+//
+//   src, dst                    source / destination, indexed in T_IMG elements
+//   src_step, dst_step          row pitch of src / dst in T_IMG elements
+//   mat_kernel                  FILTER_SIZE * FILTER_SIZE float coefficients
+//   local_data                  local scratch of LOCAL_HEIGHT * LOCAL_WIDTH pixels
+//   wholerows, wholecols        bounds used for the border tests
+//   src_offset_x, src_offset_y  origin of the processed region inside src
+//   dst_offset_x, dst_offset_y  origin of the processed region inside dst
+//   cols, rows                  size of the output region
+//   operate_cols                unused by this kernel
+__kernel void filter2D(
+    __global T_IMG *src,
+    __global T_IMG *dst,
+    int src_step,
+    int dst_step,
+    __constant float *mat_kernel,
+    __local T_IMG *local_data,
+    int wholerows,
+    int wholecols,
+    int src_offset_x,
+    int src_offset_y,
+    int dst_offset_x,
+    int dst_offset_y,
+    int cols,
+    int rows,
+    int operate_cols
+)
+{
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+    const int strideX = get_local_size(0);
+    const int strideY = get_local_size(1);
+    const int tileX = get_group_id(0) * get_local_size(0);
+    const int tileY = get_group_id(1) * get_local_size(1);
+    const int outX = tileX + lx;
+    const int outY = tileY + ly;
+    const int src_offset = mad24(src_offset_y, src_step, src_offset_x);
+    const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x);
+
+    // Stage 1: fill the local tile, striding by the work-group size so that
+    // the halo is covered as well.
+    for(int ty = ly; ty < LOCAL_HEIGHT; ty += strideY)
+    {
+        int srcY = tileY + ty;
+#ifdef BORDER_CONSTANT
+        const bool rowInside = (srcY >= FILTER_RADIUS - src_offset_y) &&
+                               ((srcY - FILTER_RADIUS) < wholerows - src_offset_y);
+#else
+        srcY = ADDR_H(srcY, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y);
+
+        srcY = ADDR_B(srcY - FILTER_RADIUS, wholerows - src_offset_y, srcY - FILTER_RADIUS);
+#endif
+
+        for(int tx = lx; tx < LOCAL_WIDTH; tx += strideX)
+        {
+            const int slot = (ty) * LOCAL_WIDTH + tx;
+            int srcX = tileX + tx;
+#ifdef BORDER_CONSTANT
+            const bool colInside = (srcX >= FILTER_RADIUS - src_offset_x) &&
+                                   ((srcX - FILTER_RADIUS) < wholecols - src_offset_x);
+            if(rowInside && colInside)
+            {
+                local_data[slot] = src[(srcY - FILTER_RADIUS) * src_step + srcX - FILTER_RADIUS + src_offset];
+            }
+            else
+            {
+                // Outside the image: constant border value.
+                local_data[slot] = 0;
+            }
+#else
+            srcX = ADDR_L(srcX, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x);
+            srcX = ADDR_R(srcX - FILTER_RADIUS, wholecols - src_offset_x, srcX - FILTER_RADIUS);
+            if(srcY < wholerows  && srcX < wholecols)
+            {
+                local_data[slot] = src[(srcY) * src_step + srcX + src_offset];
+            }
+#endif
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Stage 2: work-items outside the output region have nothing left to do
+    // (no further barriers follow, so leaving early is safe).
+    if(outY >= rows || outX >= cols)
+    {
+        return;
+    }
+
+    T_SUM acc = (T_SUM)(SUM_ZERO);
+    for(int ky = 0; ky < FILTER_SIZE; ky++)
+    {
+        const int rowBase = (ky + ly) * LOCAL_WIDTH + lx;
+
+        for(int kx = 0; kx < FILTER_SIZE; kx++)
+        {
+            acc += CONVERT_TYPE_SUM(local_data[rowBase + kx]) * mat_kernel[ky * FILTER_SIZE + kx];
+        }
+    }
+    dst[(outY)*dst_step + (outX) + dst_offset] = CONVERT_TYPE(acc);
+}
+
+/// The remainder of this file is specific to 3x3 kernels.
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////Macros defining the number of elements per thread//////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Halo width in x and y (one pixel on each side for a 3x3 filter).
+#define ANX                     1
+#define ANY                     1
+
+// Output rows produced per work-group, its log2, and the number of source rows
+// fetched into local memory (output rows plus the halo above and below).
+#define ROWS_PER_GROUP          4
+#define ROWS_PER_GROUP_BITS     2
+#define ROWS_FETCH              (ROWS_PER_GROUP + ANY + ANY)   //(ROWS_PER_GROUP + anY * 2)
+
+// Work-items assigned to each output row, and its log2.
+#define THREADS_PER_ROW         64
+#define THREADS_PER_ROW_BIT     6
+
+// Horizontally adjacent output pixels computed by each work-item, and its log2.
+#define ELEMENTS_PER_THREAD     4
+#define ELEMENTS_PER_THREAD_BIT 2
+
+// Row pitch of the local tile in pixels. The formula in the trailing comment
+// yields 260 for a local size of 256 in x (presumably the size the host
+// launches with -- confirm).
+#define LOCAL_MEM_STEP          260 //divup((get_local_size(0) + anX * 2), 4) * 4
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+///////////////////////3x3 filter, four output pixels per work-item/////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// NOTE(review): this banner used to read "8uC1", but the kernel below is
+// written against T_IMG and is not limited to 8-bit single-channel images.
+
+// 3x3-specialised filter: each work-group produces ROWS_PER_GROUP output rows
+// and each work-item produces ELEMENTS_PER_THREAD horizontally adjacent pixels.
+//
+// Stage 1: every work-item copies one pixel per fetched row (ROWS_FETCH rows)
+//          into `local_data`; the first 2 * ANX work-items additionally copy
+//          the right-hand halo. The border mode is chosen at build time.
+// Stage 2: lX % THREADS_PER_ROW selects the four-pixel column group and
+//          lX >> THREADS_PER_ROW_BIT the row inside the group. The four results
+//          are merged with the existing destination contents so that pixels
+//          outside [dst_offset_x, dst_offset_x + cols) are left untouched, and
+//          written back with one vector store.
+//
+//   src, dst                    source / destination, indexed in T_IMG elements
+//   src_step, dst_step          row pitch of src / dst in T_IMG elements
+//   mat_kernel                  FILTER_SIZE * FILTER_SIZE float coefficients
+//   local_data                  local scratch, ROWS_FETCH rows of LOCAL_MEM_STEP pixels
+//   wholerows, wholecols        bounds of the whole source image
+//   src_offset_x, src_offset_y  origin of the processed region inside src
+//   dst_offset_x, dst_offset_y  origin of the processed region inside dst
+//   cols, rows                  size of the output region
+//   operate_cols                number of columns (in work-item units) to process
+//
+// NOTE(review): the layout presumably assumes a local size of 256 in x
+// (THREADS_PER_ROW * ROWS_PER_GROUP) -- confirm against the host launch code.
+__kernel void filter2D_3x3(
+    __global T_IMG *src,
+    __global T_IMG *dst,
+    int src_step,
+    int dst_step,
+    __constant float *mat_kernel,
+    __local T_IMG *local_data,
+    int wholerows,
+    int wholecols,
+    int src_offset_x,
+    int src_offset_y,
+    int dst_offset_x,
+    int dst_offset_y,
+    int cols,
+    int rows,
+    int operate_cols
+)
+{
+    int gY = get_global_id(1);
+
+    int lX = get_local_id(0);
+
+    int groupX_size = get_local_size(0);
+    int groupX_id   = get_group_id(0);
+
+    // Misalignment of the destination origin relative to a four-pixel boundary;
+    // the source window is shifted by the same amount so that stores stay aligned.
+#define dst_align (dst_offset_x & 3)
+    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
+    int rows_start_index       = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;
+
+    if((gY << 2) < rows)
+    {
+        for(int i = 0; i < ROWS_FETCH; ++i)
+        {
+            if((rows_start_index - src_offset_y) + i < rows + ANY)
+            {
+#ifdef BORDER_CONSTANT
+                int selected_row  = rows_start_index + i;
+                int selected_cols = cols_start_index_group + lX;
+
+                // Test the coordinates before touching global memory: the halo
+                // can lie at negative or past-the-end indices, and reading src
+                // there would be an out-of-bounds access.
+                int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
+                T_IMG data = (T_IMG)(0);
+                if(con)
+                {
+                    data = src[mad24(selected_row, src_step, selected_cols)];
+                }
+                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
+
+                if(lX < (ANX << 1))
+                {
+                    selected_cols = cols_start_index_group + lX + groupX_size;
+
+                    con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
+                    data = (T_IMG)(0);
+                    if(con)
+                    {
+                        data = src[mad24(selected_row, src_step, selected_cols)];
+                    }
+                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
+                }
+#else
+                int selected_row = ADDR_H(rows_start_index + i,  0, wholerows);
+                selected_row     = ADDR_B(rows_start_index + i, wholerows, selected_row);
+
+                int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
+                selected_cols     = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);
+
+                T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
+
+                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;
+
+                if(lX < (ANX << 1))
+                {
+                    selected_cols = cols_start_index_group + lX + groupX_size;
+                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);
+
+                    data = src[mad24(selected_row, src_step, selected_cols)];
+                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
+                }
+#endif
+            }
+        }
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
+    if(((gY << 2) < rows) && (process_col < operate_cols))
+    {
+        int dst_cols_start = dst_offset_x;
+        int dst_cols_end   = dst_offset_x + cols;
+        int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc;
+
+        int dst_rows_end   = dst_offset_y + rows;
+        int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
+
+        // The last row group may extend past the output region. Check the row
+        // once, before reading dst, so rows beyond the end are never touched.
+        if(dst_rows_index < dst_rows_end)
+        {
+            dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);
+
+            // Current destination contents, kept for the pixels that fall
+            // outside the output columns.
+            T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;
+
+            T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
+            T_IMGx4 data;
+
+            for(int i = 0; i < FILTER_SIZE; i++)
+            {
+#pragma unroll
+                for(int j = 0; j < FILTER_SIZE; j++)
+                {
+                    int local_row = (lX >> THREADS_PER_ROW_BIT) + i;
+                    int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;
+
+                    // Four adjacent pixels under filter tap (i, j).
+                    data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
+                    sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
+                }
+            }
+
+            T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
+            tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
+                         tmp_dst.SX : dst_data.SX;
+            tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
+                         tmp_dst.SY : dst_data.SY;
+            tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
+                         tmp_dst.SZ : dst_data.SZ;
+            tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
+                         tmp_dst.SW : dst_data.SW;
+            *(__global T_IMGx4 *)dst = tmp_dst;
+        }
+    }
+}
--- a/modules/imgproc/src/opencl/match_template.cl
+++ b/modules/imgproc/src/opencl/match_template.cl
@ -0,0 +1,857 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// No kernel in this file calls printf, so the AMD printf extension is optional.
+// Enable it only where the compiler advertises it: requesting "enable" for an
+// unsupported extension is an error according to the OpenCL C specification.
+#ifdef cl_amd_printf
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#endif
+
+// TYPE_IMAGE_SQSUM is the element type of the squared-sum table read by
+// matchTemplate_Prepared_SQDIFF_C1_D0: double when the build defines
+// DOUBLE_SUPPORT (with the matching fp64 extension enabled), float otherwise.
+#if defined (DOUBLE_SUPPORT)
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+
+#define TYPE_IMAGE_SQSUM double
+#else
+#define TYPE_IMAGE_SQSUM float
+#endif
+
+// CN4 is a build flag on input and a multiplier afterwards (see SQSUMS_PTR):
+// not defined -> 1, defined with any value -> 4.
+#ifndef CN4
+#define CN4 1
+#else
+// drop the flag definition first so that the macro is not redefined with a
+// different replacement list
+#undef CN4
+#define CN4 4
+#endif
+
+//////////////////////////////////////////////////
+// utilities
+// Index helpers for the sum tables, addressing the entry (ox, oy) elements away
+// from the current work-item. Both macros expand in the calling kernel and read
+// its local names: gidx, gidy and img_sqsums_step / img_sqsums_offset
+// (SQSUMS_PTR) or img_sums_step / img_sums_offset (SUMS_PTR), which must already
+// be expressed in elements. Arguments are parenthesised so that any expression
+// can be passed safely.
+#define SQSUMS_PTR(ox, oy) mad24(gidy + (oy), img_sqsums_step, (gidx + img_sqsums_offset + (ox)) * CN4)
+#define SUMS_PTR(ox, oy) mad24(gidy + (oy), img_sums_step, gidx + img_sums_offset + (ox))
+// normAcc* are accurate normalization routines which make GPU matchTemplate
+// consistent with CPU one
+//
+// normAcc: returns num / denum while |num| < denum, snaps to +1 or -1 (by the
+// sign of num) while |num| < 1.125 * denum, and yields 0 in every other case.
+float normAcc(float num, float denum)
+{
+    const float magnitude = fabs(num);
+    float result = 0;
+
+    if(magnitude < denum)
+        result = num / denum;
+    else if(magnitude < denum * 1.125f)
+        result = (num > 0) ? 1 : -1; // slightly past the denominator: saturate
+
+    return result;
+}
+
+// Same normalization as normAcc, except that the fallback value (used when
+// |num| is not below 1.125 * denum) is 1 instead of 0.
+float normAcc_SQDIFF(float num, float denum)
+{
+    const float magnitude = fabs(num);
+    float result = 1;
+
+    if(magnitude < denum)
+        result = num / denum;
+    else if(magnitude < denum * 1.125f)
+        result = (num > 0) ? 1 : -1; // slightly past the denominator: saturate
+
+    return result;
+}
+//////////////////////////////////////////////////////////////////////
+// normalize
+
+// Normalizes the result map in place: every res element inside
+// res_cols x res_rows is replaced by normAcc(res, sqrt(window * tpl_sqsum)),
+// where "window" combines four img_sqsums entries as (a - b) - (c - d).
+// NOTE(review): img_sqsums is presumably a summed-area table of squared pixel
+// values - confirm against the host code.
+//   tpl_rows, tpl_cols   offsets of the far window corner
+//   *_offset, *_step     arrive in bytes and are converted to elements here
+__kernel
+void normalizeKernel_C1_D0
+(
+    __global const float * img_sqsums,
+    __global float * res,
+    ulong tpl_sqsum,
+    int res_rows,
+    int res_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int img_sqsums_offset,
+    int img_sqsums_step,
+    int res_offset,
+    int res_step
+)
+{
+    // gidx / gidy and img_sqsums_step / img_sqsums_offset are read by SQSUMS_PTR
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_offset /= sizeof(*img_sqsums);
+    img_sqsums_step /= sizeof(*img_sqsums);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(gidx >= res_cols || gidy >= res_rows)
+        return;
+
+    const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+    const float right_part = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
+    const float left_part  = img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)];
+    const float image_sqsum_ = (float)(right_part - left_part);
+
+    res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum));
+}
+
+// Turns a cross-correlation map into a squared-difference map in place:
+//   res = window - 2 * res + tpl_sqsum
+// where "window" combines four img_sqsums entries as (a - b) - (c - d).
+// The table is read as TYPE_IMAGE_SQSUM (double or float, see the top of the file).
+//   *_offset, *_step arrive in bytes and are converted to elements here
+__kernel
+void matchTemplate_Prepared_SQDIFF_C1_D0
+(
+    __global const TYPE_IMAGE_SQSUM * img_sqsums,
+    __global float * res,
+    ulong tpl_sqsum,
+    int res_rows,
+    int res_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int img_sqsums_offset,
+    int img_sqsums_step,
+    int res_offset,
+    int res_step
+)
+{
+    // gidx / gidy and img_sqsums_step / img_sqsums_offset are read by SQSUMS_PTR
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_offset /= sizeof(*img_sqsums);
+    img_sqsums_step /= sizeof(*img_sqsums);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(gidx >= res_cols || gidy >= res_rows)
+        return;
+
+    const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+    // the corner arithmetic stays in the table's own type; only the result is narrowed
+    const TYPE_IMAGE_SQSUM right_part = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
+    const TYPE_IMAGE_SQSUM left_part  = img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)];
+    const float image_sqsum_ = (float)(right_part - left_part);
+
+    res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
+}
+
+// Normalized variant of the prepared squared-difference step, in place:
+//   res = normAcc_SQDIFF(window - 2 * res + tpl_sqsum, sqrt(window * tpl_sqsum))
+// where "window" combines four img_sqsums entries as (a - b) - (c - d).
+//   *_offset, *_step arrive in bytes and are converted to elements here
+__kernel
+void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0
+(
+    __global const float * img_sqsums,
+    __global float * res,
+    ulong tpl_sqsum,
+    int res_rows,
+    int res_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int img_sqsums_offset,
+    int img_sqsums_step,
+    int res_offset,
+    int res_step
+)
+{
+    // gidx / gidy and img_sqsums_step / img_sqsums_offset are read by SQSUMS_PTR
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_offset /= sizeof(*img_sqsums);
+    img_sqsums_step /= sizeof(*img_sqsums);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(gidx >= res_cols || gidy >= res_rows)
+        return;
+
+    const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+    const float right_part = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
+    const float left_part  = img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)];
+    const float image_sqsum_ = (float)(right_part - left_part);
+
+    const float num   = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum;
+    const float denum = sqrt(image_sqsum_ * tpl_sqsum);
+
+    res[res_idx] = normAcc_SQDIFF(num, denum);
+}
+
+//////////////////////////////////////////////////
+// SQDIFF
+// Brute-force sum of squared differences for single-channel uchar data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates
+// (img - tpl)^2 over a tpl_rows x tpl_cols window and stores it in res.
+//   img_step, tpl_step, img_offset, tpl_offset  used as-is (elements are 1 byte)
+//   res_step, res_offset                        bytes, converted to elements here
+//   img_rows, img_cols                          not used by this kernel
+__kernel
+void matchTemplate_Naive_SQDIFF_C1_D0
+(
+    __global const uchar * img,
+    __global const uchar * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    int acc = 0;
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const uchar * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const uchar * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        for(int col = 0; col < tpl_cols; ++col)
+        {
+            // uchar operands are promoted to int, so the difference may be negative
+            const int diff = img_row[col] - tpl_row[col];
+            acc = mad24(diff, diff, acc);
+        }
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = acc;
+}
+
+// Brute-force sum of squared differences for single-channel float data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates
+// (img - tpl)^2 over a tpl_rows x tpl_cols window and stores it in res.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Naive_SQDIFF_C1_D5
+(
+    __global const float * img,
+    __global const float * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    img_offset /= sizeof(*img);
+    img_step   /= sizeof(*img);
+    tpl_offset /= sizeof(*tpl);
+    tpl_step   /= sizeof(*tpl);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    float acc = 0;
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const float * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const float * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        for(int col = 0; col < tpl_cols; ++col)
+        {
+            const float diff = img_row[col] - tpl_row[col];
+            acc = mad(diff, diff, acc);
+        }
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = acc;
+}
+
+// Brute-force sum of squared differences for four-channel uchar data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates the per-channel
+// (img - tpl)^2 over a tpl_rows x tpl_cols window and stores the sum of the
+// four channel totals in res.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Naive_SQDIFF_C4_D0
+(
+    __global const uchar4 * img,
+    __global const uchar4 * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    img_offset /= sizeof(*img);
+    img_step   /= sizeof(*img);
+    tpl_offset /= sizeof(*tpl);
+    tpl_step   /= sizeof(*tpl);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    int4 acc = (int4)(0, 0, 0, 0);
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const uchar4 * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const uchar4 * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        for(int col = 0; col < tpl_cols; ++col)
+        {
+            // widen each operand BEFORE subtracting: subtracting the uchar4
+            // vectors directly would wrap around instead of going negative
+            const int4 diff = convert_int4(img_row[col]) - convert_int4(tpl_row[col]);
+            acc = mad24(diff, diff, acc);
+        }
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = acc.x + acc.y + acc.z + acc.w;
+}
+
+// Brute-force sum of squared differences for four-channel float data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates the per-channel
+// (img - tpl)^2 over a tpl_rows x tpl_cols window and stores the sum of the
+// four channel totals in res.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Naive_SQDIFF_C4_D5
+(
+    __global const float4 * img,
+    __global const float4 * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    img_offset /= sizeof(*img);
+    img_step   /= sizeof(*img);
+    tpl_offset /= sizeof(*tpl);
+    tpl_step   /= sizeof(*tpl);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    float4 acc = (float4)(0, 0, 0, 0);
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const float4 * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const float4 * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        for(int col = 0; col < tpl_cols; ++col)
+        {
+            // component-wise difference of the two pixels
+            const float4 diff = img_row[col] - tpl_row[col];
+            acc = mad(diff, diff, acc);
+        }
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = acc.x + acc.y + acc.z + acc.w;
+}
+
+//////////////////////////////////////////////////
+// CCORR
+// Brute-force cross-correlation for single-channel uchar data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates img * tpl over
+// a tpl_rows x tpl_cols window (in a 32-bit int) and stores it in res.
+//   img_step, tpl_step, img_offset, tpl_offset  used as-is (elements are 1 byte)
+//   res_step, res_offset                        bytes, converted to elements here
+//   img_rows, img_cols                          not used by this kernel
+__kernel
+void matchTemplate_Naive_CCORR_C1_D0
+(
+    __global const uchar * img,
+    __global const uchar * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    int acc = 0;
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const uchar * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const uchar * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        for(int col = 0; col < tpl_cols; ++col)
+        {
+            const int img_val = convert_int(img_row[col]);
+            const int tpl_val = convert_int(tpl_row[col]);
+            acc = mad24(img_val, tpl_val, acc);
+        }
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = (float)acc;
+}
+
+// Brute-force cross-correlation for single-channel float data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates img * tpl over
+// a tpl_rows x tpl_cols window and stores it in res.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Naive_CCORR_C1_D5
+(
+    __global const float * img,
+    __global const float * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    img_offset /= sizeof(*img);
+    img_step   /= sizeof(*img);
+    tpl_offset /= sizeof(*tpl);
+    tpl_step   /= sizeof(*tpl);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    float acc = 0;
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const float * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const float * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        for(int col = 0; col < tpl_cols; ++col)
+            acc = mad(img_row[col], tpl_row[col], acc);
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = acc;
+}
+
+// Brute-force cross-correlation for four-channel uchar data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates the per-channel
+// img * tpl over a tpl_rows x tpl_cols window (in 32-bit ints) and stores the
+// sum of the four channel totals in res.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Naive_CCORR_C4_D0
+(
+    __global const uchar4 * img,
+    __global const uchar4 * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    img_offset /= sizeof(*img);
+    img_step   /= sizeof(*img);
+    tpl_offset /= sizeof(*tpl);
+    tpl_step   /= sizeof(*tpl);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    int4 acc = (int4)(0, 0, 0, 0);
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const uchar4 * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const uchar4 * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        for(int col = 0; col < tpl_cols; ++col)
+        {
+            const int4 img_val = convert_int4(img_row[col]);
+            const int4 tpl_val = convert_int4(tpl_row[col]);
+            acc = mad24(img_val, tpl_val, acc);
+        }
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = (float)(acc.x + acc.y + acc.z + acc.w);
+}
+
+// Brute-force cross-correlation for four-channel float data.
+// Each work-item (x, y) inside res_cols x res_rows accumulates the per-channel
+// img * tpl over a tpl_rows x tpl_cols window and stores the sum of the four
+// channel totals in res.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Naive_CCORR_C4_D5
+(
+    __global const float4 * img,
+    __global const float4 * tpl,
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int img_offset,
+    int tpl_offset,
+    int res_offset,
+    int img_step,
+    int tpl_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    img_offset /= sizeof(*img);
+    img_step   /= sizeof(*img);
+    tpl_offset /= sizeof(*tpl);
+    tpl_step   /= sizeof(*tpl);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(x >= res_cols || y >= res_rows)
+        return;
+
+    float4 acc = (float4)(0, 0, 0, 0);
+    for(int row = 0; row < tpl_rows; ++row)
+    {
+        // start of the current window row in the image and in the template
+        __global const float4 * img_row = img + mad24(y + row, img_step, x + img_offset);
+        __global const float4 * tpl_row = tpl + mad24(row, tpl_step, tpl_offset);
+
+        // the pixels are already float4, so they feed mad() directly
+        for(int col = 0; col < tpl_cols; ++col)
+            acc = mad(img_row[col], tpl_row[col], acc);
+    }
+
+    res[mad24(y, res_step, res_offset + x)] = acc.x + acc.y + acc.z + acc.w;
+}
+
+//////////////////////////////////////////////////
+// CCOFF
+// Single-channel correction step applied to a correlation map in place:
+//   res -= window * tpl_sum
+// where "window" combines four img_sums entries as (a - b) - (c - d), computed
+// in unsigned 32-bit arithmetic and then converted to float.
+// NOTE(review): tpl_sum is presumably pre-scaled by the host (e.g. a template
+// mean) - confirm against the caller.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Prepared_CCOFF_C1_D0
+(
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int res_offset,
+    int res_step,
+    __global const uint * img_sums,
+    int img_sums_offset,
+    int img_sums_step,
+    float tpl_sum
+)
+{
+    // gidx / gidy and img_sums_step / img_sums_offset are read by SUMS_PTR
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step     /= sizeof(*img_sums);
+    img_sums_offset   /= sizeof(*img_sums);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(gidx >= res_cols || gidy >= res_rows)
+        return;
+
+    const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+    const uint right_part = img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)];
+    const uint left_part  = img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)];
+    const float window_sum = (float)(right_part - left_part);
+
+    res[res_idx] = res[res_idx] - window_sum * tpl_sum;
+}
+// Four-channel correction step applied to a correlation map in place:
+//   res -= sum over channels c of (tpl_sum_c * window_c)
+// where each window_c combines four entries of that channel's sum table as
+// (a - b) - (c - d), computed in unsigned 32-bit arithmetic and then converted
+// to float. All four tables share img_sums_offset / img_sums_step.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Prepared_CCOFF_C4_D0
+(
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int res_offset,
+    int res_step,
+    __global const uint * img_sums_c0,
+    __global const uint * img_sums_c1,
+    __global const uint * img_sums_c2,
+    __global const uint * img_sums_c3,
+    int img_sums_offset,
+    int img_sums_step,
+    float tpl_sum_c0,
+    float tpl_sum_c1,
+    float tpl_sum_c2,
+    float tpl_sum_c3
+)
+{
+    // gidx / gidy and img_sums_step / img_sums_offset are read by SUMS_PTR
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sums_step     /= sizeof(*img_sums_c0);
+    img_sums_offset   /= sizeof(*img_sums_c0);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(gidx >= res_cols || gidy >= res_rows)
+        return;
+
+    const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+    // the four corner positions are the same for every channel table
+    const int far_bottom  = SUMS_PTR(tpl_cols, tpl_rows);
+    const int far_top     = SUMS_PTR(tpl_cols, 0);
+    const int near_bottom = SUMS_PTR(0, tpl_rows);
+    const int near_top    = SUMS_PTR(0, 0);
+
+    const float window_c0 = (float)((img_sums_c0[far_bottom] - img_sums_c0[far_top])
+                                    - (img_sums_c0[near_bottom] - img_sums_c0[near_top]));
+    const float window_c1 = (float)((img_sums_c1[far_bottom] - img_sums_c1[far_top])
+                                    - (img_sums_c1[near_bottom] - img_sums_c1[near_top]));
+    const float window_c2 = (float)((img_sums_c2[far_bottom] - img_sums_c2[far_top])
+                                    - (img_sums_c2[near_bottom] - img_sums_c2[near_top]));
+    const float window_c3 = (float)((img_sums_c3[far_bottom] - img_sums_c3[far_top])
+                                    - (img_sums_c3[near_bottom] - img_sums_c3[near_top]));
+
+    // subtract channel by channel, in channel order
+    float ccorr = res[res_idx];
+    ccorr -= tpl_sum_c0 * window_c0;
+    ccorr -= tpl_sum_c1 * window_c1;
+    ccorr -= tpl_sum_c2 * window_c2;
+    ccorr -= tpl_sum_c3 * window_c3;
+    res[res_idx] = ccorr;
+}
+
+// Single-channel normalized correction step, applied to res in place:
+//   num   = res - window_sum * tpl_sum
+//   denum = sqrt(tpl_sqsum * (window_sqsum - weight * window_sum^2))
+//   res   = normAcc(num, denum)
+// window_sum / window_sqsum each combine four entries of img_sums / img_sqsums
+// as (a - b) - (c - d).
+// NOTE(review): weight is presumably 1 / (tpl_rows * tpl_cols) - confirm
+// against the host code.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Prepared_CCOFF_NORMED_C1_D0
+(
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int res_offset,
+    int res_step,
+    float weight,
+    __global const uint * img_sums,
+    int img_sums_offset,
+    int img_sums_step,
+    __global const float * img_sqsums,
+    int img_sqsums_offset,
+    int img_sqsums_step,
+    float tpl_sum,
+    float tpl_sqsum
+)
+{
+    // gidx / gidy and the *_step / *_offset names are read by SUMS_PTR / SQSUMS_PTR
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_offset /= sizeof(*img_sqsums);
+    img_sqsums_step   /= sizeof(*img_sqsums);
+    img_sums_step     /= sizeof(*img_sums);
+    img_sums_offset   /= sizeof(*img_sums);
+    res_offset /= sizeof(*res);
+    res_step   /= sizeof(*res);
+
+    // work-items outside the result map have nothing to do
+    if(gidx >= res_cols || gidy >= res_rows)
+        return;
+
+    const int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+    // unsigned arithmetic on the sum table, converted to float at the end
+    const uint sum_right = img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)];
+    const uint sum_left  = img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)];
+    const float image_sum_ = (float)(sum_right - sum_left);
+
+    const float sq_right = img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)];
+    const float sq_left  = img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)];
+    const float image_sqsum_ = (float)(sq_right - sq_left);
+
+    const float num   = res[res_idx] - image_sum_ * tpl_sum;
+    const float denum = sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_));
+
+    res[res_idx] = normAcc(num, denum);
+}
+// Four-channel normalized correction step, applied to res in place:
+//   num   = res - sum over channels c of (image_sum_c * tpl_sum_c)
+//   denum = sqrt(tpl_sqsum * sum over c of (image_sqsum_c - weight * image_sum_c^2))
+//   res   = normAcc(num, denum)
+// image_sum_c / image_sqsum_c each combine four entries of that channel's
+// sum / squared-sum table as (a - b) - (c - d). The four sum tables share
+// img_sums_offset / img_sums_step, the four squared-sum tables share
+// img_sqsums_offset / img_sqsums_step.
+// NOTE(review): weight is presumably 1 / (tpl_rows * tpl_cols) - confirm
+// against the host code.
+//   *_step, *_offset    arrive in bytes and are converted to elements here
+//   img_rows, img_cols  not used by this kernel
+__kernel
+void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
+(
+    __global float * res,
+    int img_rows,
+    int img_cols,
+    int tpl_rows,
+    int tpl_cols,
+    int res_rows,
+    int res_cols,
+    int res_offset,
+    int res_step,
+    float weight,
+    __global const uint * img_sums_c0,
+    __global const uint * img_sums_c1,
+    __global const uint * img_sums_c2,
+    __global const uint * img_sums_c3,
+    int img_sums_offset,
+    int img_sums_step,
+    __global const float * img_sqsums_c0,
+    __global const float * img_sqsums_c1,
+    __global const float * img_sqsums_c2,
+    __global const float * img_sqsums_c3,
+    int img_sqsums_offset,
+    int img_sqsums_step,
+    float tpl_sum_c0,
+    float tpl_sum_c1,
+    float tpl_sum_c2,
+    float tpl_sum_c3,
+    float tpl_sqsum
+)
+{
+    // gidx / gidy and the *_step / *_offset names are read by SUMS_PTR / SQSUMS_PTR
+    int gidx = get_global_id(0);
+    int gidy = get_global_id(1);
+
+    img_sqsums_step   /= sizeof(*img_sqsums_c0);
+    img_sqsums_offset /= sizeof(*img_sqsums_c0);
+    img_sums_offset   /= sizeof(*img_sums_c0);
+    img_sums_step     /= sizeof(*img_sums_c0);
+    res_step   /= sizeof(*res);
+    res_offset /= sizeof(*res);
+
+    int res_idx = mad24(gidy, res_step, res_offset + gidx);
+
+    if(gidx < res_cols && gidy < res_rows)
+    {
+        // per-channel window sums (unsigned arithmetic, converted to float at the end)
+        float image_sum_c0 =  (float)(
+                                  (img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)]));
+        float image_sum_c1 =  (float)(
+                                  (img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)]));
+        float image_sum_c2 =  (float)(
+                                  (img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)]));
+        float image_sum_c3 =  (float)(
+                                  (img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)])
+                                  - (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)]));
+
+        // per-channel window squared sums
+        float image_sqsum_c0 = (float)(
+                                   (img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)]));
+        float image_sqsum_c1 = (float)(
+                                   (img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)]));
+        float image_sqsum_c2 = (float)(
+                                   (img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)]));
+        float image_sqsum_c3 = (float)(
+                                   (img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) -
+                                   (img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)]));
+
+        float num = res[res_idx] -
+                    image_sum_c0 * tpl_sum_c0 -
+                    image_sum_c1 * tpl_sum_c1 -
+                    image_sum_c2 * tpl_sum_c2 -
+                    image_sum_c3 * tpl_sum_c3;
+        // every channel contributes (sqsum - weight * sum^2); the channel-3 term
+        // must square image_sum_c3 like the other channels do with their own sum
+        float denum = sqrt( tpl_sqsum * (
+                                image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 +
+                                image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 +
+                                image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 +
+                                image_sqsum_c3 - weight * image_sum_c3 * image_sum_c3)
+                          );
+        res[res_idx] = normAcc(num, denum);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+// extractFirstChannel
+// Copies the .x component of every float4 pixel of img into the float map res,
+// for work-items (x, y) inside cols x rows.
+//   *_step, *_offset arrive in bytes and are converted to elements here
+__kernel
+void extractFirstChannel
+(
+    const __global float4* img,
+    __global float* res,
+    int rows,
+    int cols,
+    int img_offset,
+    int res_offset,
+    int img_step,
+    int res_step
+)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    img_offset /= sizeof(float4);
+    img_step   /= sizeof(float4);
+    res_offset /= sizeof(float);
+    res_step   /= sizeof(float);
+
+    // work-items outside the image have nothing to do
+    if(x >= cols || y >= rows)
+        return;
+
+    // first element of each buffer's region of interest
+    const __global float4* src = img + img_offset;
+    __global float* dst = res + res_offset;
+
+    dst[x + y * res_step] = src[x + y * img_step].x;
+}
--- a/modules/imgproc/src/opencl/median.cl
+++ b/modules/imgproc/src/opencl/median.cl
@ -0,0 +1,486 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Zero Lin, zero.lin@amd.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+
+
+/*
+__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep, int m)
+{
+    int dx = get_global_id(0)-(m>>1);
+    int dy = get_global_id(1)-(m>>1);
+
+    short histom[256];
+    for(int i=0;i<256;++i)
+        histom[i]=0;
+
+
+    for(int i=0;i<m;++i)
+    {
+        __global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1));
+        for(int j=dx;j<dx+m;++j)
+        {
+            histom[data[clamp(j, 0, cols-1)]]++;
+        }
+    }
+
+    int now=0;
+    int goal=(m*m+1)>>1;
+    int v;
+    for(int i=0;i<256;++i)
+    {
+        v=(now<goal?i:v);
+        now+=histom[i];
+    }
+
+    if(dy<rows && dx<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v;
+}
+*/
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as uchar4 element counts
+{
+    // 3x3 median, one work-item per pixel: the group stages an 18x18 tile in local memory, then each work-item sorts its own window.
+    __local uchar4 data[18][18];
+    __global uchar4* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus one in x and y
+    int dx = get_global_id(0) - get_local_id(0) -1;
+    int dy = get_global_id(1) - get_local_id(1) -1;
+    // flat loader slot, capped at the end of tile rows 0..8; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
+
+    int dr=id/18;
+    int dc=id%18;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+9); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+9, 0, rows-1);
+    data[dr+9][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p8: the 3x3 window with top-left corner data[y][x]; p4 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
+    uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
+    uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
+    uchar4 mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, component-wise, mid as scratch); after this fixed sequence p4 is stored as the median
+    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
+    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
+    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
+    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
+    op(p4, p2); op(p6, p4); op(p4, p2);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
+}
+#undef op
+
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as uchar element counts
+{
+    // 3x3 median, one work-item per pixel: the group stages an 18x18 tile in local memory, then each work-item sorts its own window.
+    __local uchar data[18][18];
+    __global uchar* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus one in x and y
+    int dx = get_global_id(0) - get_local_id(0) -1;
+    int dy = get_global_id(1) - get_local_id(1) -1;
+    // flat loader slot, capped at the end of tile rows 0..8; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
+
+    int dr=id/18;
+    int dc=id%18;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+9); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+9, 0, rows-1);
+    data[dr+9][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p8: the 3x3 window with top-left corner data[y][x]; p4 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
+    uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
+    uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
+    uchar mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, mid as scratch); after this fixed sequence p4 is stored as the median
+    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
+    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
+    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
+    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
+    op(p4, p2); op(p6, p4); op(p4, p2);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
+}
+#undef op
+
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as float element counts
+{
+    // 3x3 median, one work-item per pixel: the group stages an 18x18 tile in local memory, then each work-item sorts its own window.
+    __local float data[18][18];
+    __global float* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus one in x and y
+    int dx = get_global_id(0) - get_local_id(0) -1;
+    int dy = get_global_id(1) - get_local_id(1) -1;
+    // flat loader slot, capped at the end of tile rows 0..8; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
+
+    int dr=id/18;
+    int dc=id%18;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+9); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+9, 0, rows-1);
+    data[dr+9][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p8: the 3x3 window with top-left corner data[y][x]; p4 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
+    float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
+    float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
+    float mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, mid as scratch); after this fixed sequence p4 is stored as the median
+    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
+    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
+    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
+    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
+    op(p4, p2); op(p6, p4); op(p4, p2);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
+}
+#undef op
+
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as float4 element counts
+{
+    // 3x3 median, one work-item per pixel: the group stages an 18x18 tile in local memory, then each work-item sorts its own window.
+    __local float4 data[18][18];
+    __global float4* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus one in x and y
+    int dx = get_global_id(0) - get_local_id(0) -1;
+    int dy = get_global_id(1) - get_local_id(1) -1;
+    // flat loader slot, capped at the end of tile rows 0..8; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1);
+
+    int dr=id/18;
+    int dc=id%18;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+9); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+9, 0, rows-1);
+    data[dr+9][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p8: the 3x3 window with top-left corner data[y][x]; p4 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2];
+    float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2];
+    float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2];
+    float4 mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, component-wise, mid as scratch); after this fixed sequence p4 is stored as the median
+    op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1);
+    op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5);
+    op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7);
+    op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7);
+    op(p4, p2); op(p6, p4); op(p4, p2);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4;
+}
+#undef op
+
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as uchar4 element counts
+{
+    // 5x5 median, one work-item per pixel: the group stages a 20x20 tile in local memory, then each work-item sorts its own window.
+    __local uchar4 data[20][20];
+    __global uchar4* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus two in x and y
+    int dx = get_global_id(0) - get_local_id(0) -2;
+    int dy = get_global_id(1) - get_local_id(1) -2;
+    // flat loader slot, capped at the end of tile rows 0..9; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
+
+    int dr=id/20;
+    int dc=id%20;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+10); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+10, 0, rows-1);
+    data[dr+10][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p24: the 5x5 window with top-left corner data[y][x]; p12 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
+    uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
+    uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
+    uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
+    uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
+    uchar4 mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, component-wise, mid as scratch); after this fixed sequence p12 is stored as the median
+    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
+    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
+    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
+    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
+    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
+    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
+    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
+    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
+    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
+    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
+    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
+    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
+    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
+    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
+    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
+    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
+    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
+    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
+    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
+    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
+    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
+    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
+    op(p7, p11); op(p11, p13); op(p11, p12);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
+}
+#undef op
+
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as uchar element counts
+{
+    // 5x5 median, one work-item per pixel: the group stages a 20x20 tile in local memory, then each work-item sorts its own window.
+    __local uchar data[20][20];
+    __global uchar* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus two in x and y
+    int dx = get_global_id(0) - get_local_id(0) -2;
+    int dy = get_global_id(1) - get_local_id(1) -2;
+    // flat loader slot, capped at the end of tile rows 0..9; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
+
+    int dr=id/20;
+    int dc=id%20;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+10); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+10, 0, rows-1);
+    data[dr+10][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p24: the 5x5 window with top-left corner data[y][x]; p12 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
+    uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
+    uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
+    uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
+    uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
+    uchar mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, mid as scratch); after this fixed sequence p12 is stored as the median
+    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
+    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
+    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
+    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
+    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
+    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
+    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
+    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
+    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
+    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
+    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
+    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
+    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
+    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
+    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
+    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
+    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
+    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
+    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
+    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
+    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
+    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
+    op(p7, p11); op(p11, p13); op(p11, p12);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
+}
+#undef op
+
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as float4 element counts
+{
+    // 5x5 median, one work-item per pixel: the group stages a 20x20 tile in local memory, then each work-item sorts its own window.
+    __local float4 data[20][20];
+    __global float4* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus two in x and y
+    int dx = get_global_id(0) - get_local_id(0) -2;
+    int dy = get_global_id(1) - get_local_id(1) -2;
+    // flat loader slot, capped at the end of tile rows 0..9; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
+
+    int dr=id/20;
+    int dc=id%20;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+10); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+10, 0, rows-1);
+    data[dr+10][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p24: the 5x5 window with top-left corner data[y][x]; p12 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
+    float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
+    float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
+    float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
+    float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
+    float4 mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, component-wise, mid as scratch); after this fixed sequence p12 is stored as the median
+    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
+    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
+    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
+    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
+    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
+    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
+    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
+    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
+    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
+    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
+    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
+    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
+    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
+    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
+    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
+    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
+    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
+    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
+    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
+    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
+    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
+    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
+    op(p7, p11); op(p11, p13); op(p11, p12);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
+}
+#undef op
+
+#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);}
+__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst,  int srcOffset, int dstOffset, int cols,
+                                int rows, int srcStep, int dstStep) // offsets and steps are applied as float element counts
+{
+    // 5x5 median, one work-item per pixel: the group stages a 20x20 tile in local memory, then each work-item sorts its own window.
+    __local float data[20][20];
+    __global float* source=src + srcOffset;
+    // (dx, dy): source coordinate held by data[0][0], i.e. the group's first work-item minus two in x and y
+    int dx = get_global_id(0) - get_local_id(0) -2;
+    int dy = get_global_id(1) - get_local_id(1) -2;
+    // flat loader slot, capped at the end of tile rows 0..9; NOTE(review): the factor 16 presumably assumes a 16x16 work-group - confirm
+    const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1);
+
+    int dr=id/20;
+    int dc=id%20;
+    int r=clamp(dy+dr, 0, rows-1);
+    int c=clamp(dx+dc, 0, cols-1);
+    // every work-item copies two tile elements (rows dr and dr+10); coordinates are clamped to [0, rows-1] x [0, cols-1]
+    data[dr][dc] = source[r*srcStep + c];
+    r=clamp(dy+dr+10, 0, rows-1);
+    data[dr+10][dc] = source[r*srcStep + c];
+
+    barrier(CLK_LOCAL_MEM_FENCE); // the tile must be complete before any work-item reads its neighbours
+    // p0..p24: the 5x5 window with top-left corner data[y][x]; p12 is this work-item's own (clamped) pixel
+    int x =get_local_id(0);
+    int y =get_local_id(1);
+    float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4];
+    float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4];
+    float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4];
+    float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4];
+    float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4];
+    float mid;
+    // op(a,b) is a compare-exchange (a <- min, b <- max, mid as scratch); after this fixed sequence p12 is stored as the median
+    op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4);
+    op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4);
+    op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8);
+    op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11);
+    op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6);
+    op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8);
+    op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5);
+    op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8);
+    op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17);
+    op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15);
+    op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19);
+    op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24);
+    op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22);
+    op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18);
+    op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16);
+    op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19);
+    op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16);
+    op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12);
+    op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16);
+    op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10);
+    op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17);
+    op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19);
+    op(p7, p11); op(p11, p13); op(p11, p12);
+    // work-items that fall outside the cols x rows image do not write
+    if(get_global_id(1)<rows && get_global_id(0)<cols)
+        dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
+}
+#undef op
--- a/modules/imgproc/src/opencl/mineigenval.cl
+++ b/modules/imgproc/src/opencl/mineigenval.cl
@ -0,0 +1,207 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Shengen Yan,yanshengen@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined (DOUBLE_SUPPORT)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif // DOUBLE_SUPPORT (not defined in this file; it has to come from the compile options)
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh   (an index outside [low edge, high edge) snaps to the nearest edge element)
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (l_edge)   : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (t_edge)   :(i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb   (mirror with the edge element repeated; the low-side result -(i)-1 does not use l_edge/t_edge, i.e. it assumes a low edge of 0)
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)-1               : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)-1 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT101
+//BORDER_REFLECT101:   gfedcb|abcdefgh|gfedcba   (mirror about the edge element itself, which is not repeated; the low-side result -(i) assumes a low edge of 0)
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? -(i)                 : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? -(i)                 : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_WRAP
+//BORDER_WRAP:          cdefgh|abcdefgh|abcdefg   (the index is shifted by the high edge value once; the low side adds it, the high side subtracts it)
+#define ADDR_L(i, l_edge, r_edge)  ((i) <  (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#define ADDR_H(i, t_edge, b_edge)  ((i) <  (t_edge) ? (i)+(b_edge) : (i))
+#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#endif
+// THREADS: row width of the __local temp array in calcMinEigenVal (indexed by get_local_id(0)). ELEM: elem1 when l_edge <= i < r_edge, otherwise elem2.
+#define THREADS 256
+#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2))
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// calcMinEigenVal: dst = (a+c) - sqrt((a-c)^2 + b^2), where a, c are half the windowed sums of Dx*Dx, Dy*Dy and b is the windowed sum of Dx*Dy.
+// ksX, ksY, anX, anY and the BORDER_* selector are not defined in this file; they have to be supplied by the compile options.
+__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst,
+                              int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
+                              int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
+                              float k) // k is not read by this kernel; it stays so the argument list is unchanged
+{
+    int col = get_local_id(0);
+    const int gX = get_group_id(0);
+    const int gY = get_group_id(1);
+    const int gly = get_global_id(1);
+
+    // split every byte offset into a row (offset / step) and a float column ((offset % step) / 4)
+    int dx_x_off = (dx_offset % dx_step) >> 2;
+    int dx_y_off = dx_offset / dx_step;
+    int dy_x_off = (dy_offset % dy_step) >> 2;
+    int dy_y_off = dy_offset / dy_step;
+    int dst_x_off = (dst_offset % dst_step) >> 2;
+    int dst_y_off = dst_offset / dst_step;
+    // a group covers THREADS-ksX+1 output columns and two output rows; the inputs start anX columns / anY rows earlier
+    int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off;
+    int dx_startY = (gY << 1) - anY + dx_y_off;
+    int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off;
+    int dy_startY = (gY << 1) - anY + dy_y_off;
+    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
+    int dst_startY = (gY << 1) + dst_y_off;
+    // data[0..2][i] hold Dx*Dx, Dx*Dy and Dy*Dy for the ksY+1 input rows of this work-item's column
+    float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
+    __local float temp[6][THREADS];
+#ifdef BORDER_CONSTANT
+    bool dx_con,dy_con;
+    float dx_s,dy_s;
+    for(int i=0; i < ksY+1; i++)
+    {
+        dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
+        dx_s = dx_con ? Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)] : 0.0f; // never dereference an out-of-range index
+        dx_data[i] = dx_s;
+        dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
+        dy_s = dy_con ? Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)] : 0.0f; // samples outside the matrix count as 0
+        dy_data[i] = dy_s;
+        data[0][i] = dx_data[i] * dx_data[i];
+        data[1][i] = dx_data[i] * dy_data[i];
+        data[2][i] = dy_data[i] * dy_data[i];
+    }
+#else
+    int clamped_col = min(dst_cols, col);
+    // NOTE(review): the local column is clamped to dst_cols before border handling - the intent is not visible here, confirm
+    for(int i=0; i < ksY+1; i++)
+    {
+        int dx_selected_row;
+        int dx_selected_col;
+        dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
+        dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
+        dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
+        dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
+        dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
+        // same two-step fix-up for Dy: ADDR_H/ADDR_L handle the low edge, ADDR_B/ADDR_R then override at the high edge
+        int dy_selected_row;
+        int dy_selected_col;
+        dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
+        dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
+        dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
+        dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
+        dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
+        // per-row products that are summed over the window below
+        data[0][i] = dx_data[i] * dx_data[i];
+        data[1][i] = dx_data[i] * dy_data[i];
+        data[2][i] = dy_data[i] * dy_data[i];
+    }
+#endif
+    float sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f; // rows 1..ksY-1 are shared by both output rows
+    for(int i=1; i < ksY; i++)
+    {
+        sum0 += (data[0][i]);
+        sum1 += (data[1][i]);
+        sum2 += (data[2][i]);
+    }
+    float sum01,sum02,sum11,sum12,sum21,sum22;
+    sum01 = sum0 + (data[0][0]);   // rows 0..ksY-1: column sum for the first output row
+    sum02 = sum0 + (data[0][ksY]); // rows 1..ksY:   column sum for the second output row
+    temp[0][col] = sum01;
+    temp[1][col] = sum02;
+    sum11 = sum1 + (data[1][0]);
+    sum12 = sum1 + (data[1][ksY]);
+    temp[2][col] = sum11;
+    temp[3][col] = sum12;
+    sum21 = sum2 + (data[2][0]);
+    sum22 = sum2 + (data[2][ksY]);
+    temp[4][col] = sum21;
+    temp[5][col] = sum22;
+    barrier(CLK_LOCAL_MEM_FENCE); // all column sums of the group must be in temp before the horizontal pass
+    if(col < (THREADS-(ksX-1)))
+    {
+        col += anX;
+        int posX = dst_startX - dst_x_off + col - anX;
+        int posY = (gly << 1); // NOTE(review): global id here, group id in dst_startY - presumably the local size in dimension 1 is 1, confirm
+        int till = (ksX + 1)%2; // 1 for even ksX, which drops the right-most column of the range below
+        float tmp_sum[6]={ 0.0f, 0.0f , 0.0f, 0.0f, 0.0f, 0.0f };
+        for(int s=0; s<6; s++) // named s so that the kernel argument k is not shadowed
+            for(int i=-anX; i<=anX - till; i++)
+            {
+                tmp_sum[s] += temp[s][col+i];
+            }
+        // even tmp_sum entries belong to the first output row, odd entries to the second
+        if(posX < dst_cols && (posY) < dst_rows)
+        {
+            float a = tmp_sum[0] * 0.5f;
+            float b = tmp_sum[2];
+            float c = tmp_sum[4] * 0.5f;
+            dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
+        }
+        if(posX < dst_cols && (posY + 1) < dst_rows)
+        {
+            float a = tmp_sum[1] * 0.5f;
+            float b = tmp_sum[3];
+            float c = tmp_sum[5] * 0.5f;
+            dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
+        }
+    }
+}
--- a/modules/imgproc/src/opencl/moments.cl
+++ b/modules/imgproc/src/opencl/moments.cl
@ -0,0 +1,980 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Sen Liu, swjtuls1987@126.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Type configuration shared by the moment kernels below:
+//   F, F4      - floating-point accumulator type and its 4-wide vector form
+//   T          - element type written by icvContourMoments
+//   convert_F4 - vector conversion to F4
+// With DOUBLE_SUPPORT defined all three are double precision; without it F is
+// float and T is long.
+#ifdef DOUBLE_SUPPORT
+
+// Enable whichever fp64 extension the compiler advertises.
+#if defined(cl_khr_fp64)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined(cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+
+#define convert_F4 convert_double4
+typedef double4 F4;
+typedef double F;
+typedef double T;
+
+#else // !DOUBLE_SUPPORT
+
+#define convert_F4 convert_float4
+typedef float4 F4;
+typedef float F;
+typedef long T;
+
+#endif // DOUBLE_SUPPORT
+
+// Index of the destination row (or band of rows) that holds each spatial
+// moment m_pq; the kernels multiply it by the destination row stride.
+#define DST_ROW_00     0
+#define DST_ROW_10     1
+#define DST_ROW_01     2
+#define DST_ROW_20     3
+#define DST_ROW_11     4
+#define DST_ROW_02     5
+#define DST_ROW_30     6
+#define DST_ROW_21     7
+#define DST_ROW_12     8
+#define DST_ROW_03     9
+
+// Per-edge terms of the contour moments.
+//
+// Work-item idx processes the edge from contour point idx to its successor
+// (the last point connects back to point 0) and stores ten values, one per
+// DST_ROW_* row of dst_a, in column idx.
+//
+//   contour_total      - number of contour points; work-items past it exit
+//   reader_oclmat_data - point coordinates, interleaved (x at 2*i, y at 2*i+1)
+//   dst_a              - output buffer of T
+//   dst_step           - distance between rows of dst_a, in bytes
+__kernel void icvContourMoments(int contour_total,
+                                __global float* reader_oclmat_data,
+                                __global T* dst_a,
+                                int dst_step)
+{
+    const size_t gid = get_global_id(0);
+    const int idx = gid;
+
+    if (idx < 0 || idx >= contour_total)
+        return;
+
+    // Successor point; wraps around to close the contour.
+    const int next = (idx == contour_total - 1) ? 0 : idx + 1;
+
+    // (ax, ay) is point idx, (bx, by) is its successor.
+    const T ax = (T)reader_oclmat_data[gid << 1];
+    const T ay = (T)reader_oclmat_data[(gid << 1) + 1];
+    const T bx = (T)reader_oclmat_data[next * 2];
+    const T by = (T)reader_oclmat_data[next * 2 + 1];
+
+    const T ax2 = ax * ax;
+    const T ay2 = ay * ay;
+    const T bx2 = bx * bx;
+    const T by2 = by * by;
+    const T det = ax * by - bx * ay;   // 2-D cross product of the two points
+    const T sx  = ax + bx;
+    const T sy  = ay + by;
+
+    // Row stride in elements, and this work-item's column in row 0.
+    const int stride = dst_step / sizeof(T);
+    __global T* out = dst_a + idx;
+
+    out[DST_ROW_00 * stride] = det;
+    out[DST_ROW_10 * stride] = det * sx;
+    out[DST_ROW_01 * stride] = det * sy;
+    out[DST_ROW_20 * stride] = det * (ax * sx + bx2);
+    out[DST_ROW_11 * stride] = det * (ax * (sy + ay) + bx * (sy + by));
+    out[DST_ROW_02 * stride] = det * (ay * sy + by2);
+    out[DST_ROW_30 * stride] = det * sx * (ax2 + bx2);
+    out[DST_ROW_21 * stride] =
+        det * (ax2 * (3 * ay + by) + 2 * bx * ax * sy +
+               bx2 * (ay + 3 * by));
+    out[DST_ROW_12 * stride] =
+        det * (ay2 * (3 * ax + bx) + 2 * by * ay * sx +
+               by2 * (ax + 3 * bx));
+    out[DST_ROW_03 * stride] = det * sy * (ay2 + by2);
+}
+
+// Adds up the per-tile moments stored in dst_m and writes the ten totals to
+// sum[0..9].
+//
+// The work-item with global id g (dimension 0) loads the ten moments of tile g
+// into local memory; the 128 slots per moment are then folded pairwise.
+//
+//   src_rows, src_cols      - image size, used to derive the tile count
+//   tile_height, tile_width - tile dimensions used for the division
+//   TILE_SIZE               - tile edge used to detect a trailing partial tile
+//   sum                     - output, ten values of F
+//   dst_m                   - per-tile moments
+//   dst_step                - row stride of dst_m in bytes
+// NOTE(review): local memory is indexed with the global id, so this appears to
+// assume a single work-group and at most 128 tiles - confirm against the host.
+__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE,
+                      __global F* sum, __global F* dst_m, int dst_step)
+{
+    const int gidy = get_global_id(0);
+
+    // Tile counts per axis; a trailing partial tile adds one.
+    int block_y = src_rows / tile_height;
+    int block_x = src_cols / tile_width;
+    block_y += (src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) ? 1 : 0;
+    block_x += (src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) ? 1 : 0;
+    const int block_num = block_y * block_x;
+
+    __local F acc[10][128];
+
+    // Clear the slots that no tile will fill.
+    if (gidy < 128 - block_num)
+    {
+        const int slot = gidy + block_num;
+        for (int k = 0; k < 10; k++)
+            acc[k][slot] = 0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    dst_step /= sizeof(F);
+    if (gidy < block_num)
+    {
+        // acc[k] receives the moment stored in row band rows[k].
+        const int rows[10] = { DST_ROW_00, DST_ROW_10, DST_ROW_01, DST_ROW_20, DST_ROW_11,
+                               DST_ROW_02, DST_ROW_30, DST_ROW_21, DST_ROW_12, DST_ROW_03 };
+        for (int k = 0; k < 10; k++)
+            acc[k][gidy] = dst_m[mad24(rows[k] * block_y, dst_step, gidy)];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Pairwise fold: 128 -> 64 -> ... -> 1 slot per moment.
+    for (int span = 64; span >= 1; span /= 2)
+    {
+        if (gidy < span)
+        {
+            const int partner = gidy + span;
+            for (int k = 0; k < 10; k++)
+                acc[k][gidy] += acc[k][partner];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (gidy == 0)
+    {
+        for (int k = 0; k < 10; k++)
+            sum[k] = acc[k][0];
+    }
+}
+
+// Computes the ten raw spatial moments (m00 .. m03) of one tile of an 8-bit
+// image.
+//
+// Each work-group handles one TILE_SIZE x TILE_SIZE tile. The work-item with
+// local id lidy (dimension 0) accumulates image row y+lidy of the tile, the
+// per-row results are summed through local memory, and work-item (0,0) writes
+// the tile moments, shifted from tile to image coordinates, into dst_m.
+//
+//   src_data           - source pixels; src_step is the row stride in bytes
+//   src_rows, src_cols - image size in pixels
+//   dst_m              - per-tile output; moment k of tile (wgidy, wgidx) goes to
+//                        element (k*blocky)*(dst_step/sizeof(F)) + wgidy*dst_cols + wgidx
+//   depth              - not used by this kernel
+//   cn, coi            - when coi > 0, channel coi-1 is gathered from interleaved
+//                        pixels (stride 2 when cn == 2, otherwise 4)
+//   binary             - non-zero: every non-zero pixel counts as 1
+//   TILE_SIZE          - tile edge length
+//
+// NOTE(review): the reduction assumes TILE_SIZE/2 is a power of two no larger
+// than 128 (the size of the local scratch) - confirm against the host code.
+// NOTE(review): all accumulators here are 32-bit int while the 16-bit kernels
+// use long; confirm the higher-order sums cannot overflow for the tile size used.
+__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int depth, int cn, int coi, int binary, int TILE_SIZE)
+{
+    uchar tmp_coi[16]; // scratch for one vector of channel-of-interest samples
+    uchar16 tmp[16];   // this work-item's tile row, 16 pixels per element
+    int VLEN_C = 16;   // vector length of uchar
+
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE; // first image row of this tile
+    int x = wgidx*TILE_SIZE; // first image column of this tile
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    if ( y+lidy < src_rows )
+    {
+        // NOTE(review): tileSize_width < TILE_SIZE implies
+        // tileSize_width == src_cols - x, so (x+i) < src_cols is already false
+        // at i == tileSize_width and this loop body never runs. It presumably
+        // was meant to clear the columns past the image edge, which the
+        // 16-wide loads below still read - confirm the intended bound.
+        if( tileSize_width < TILE_SIZE )
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0;
+
+        if( coi > 0 )   // channel of interest: gather one sample per pixel
+            for(int i = 0; i < tileSize_width; i += VLEN_C)
+            {
+                for(int j=0; j<VLEN_C; j++)
+                    tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7],
+                                          tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]);
+            }
+        else            // single channel: straight 16-pixel vector loads
+            for(int i=0; i < tileSize_width; i+=VLEN_C)
+                tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C);
+    }
+
+    // Binary mode: map every non-zero pixel to 255; the 1/255 scale applied at
+    // the end turns that into a weight of 1.
+    uchar16 zero = (uchar16)(0);
+    uchar16 full = (uchar16)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_C)
+            tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero;
+
+    F mom[10];
+    __local int m[10][128]; // scratch for the cross-row reduction
+    if(lidy < 128)
+    {
+        for(int i=0; i<10; i++)
+            m[i][lidy]=0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Row sums of p, x*p, x^2*p and x^3*p, with x relative to the tile.
+    int lm[10] = {0};
+    int16 x0 = (int16)(0);
+    int16 x1 = (int16)(0);
+    int16 x2 = (int16)(0);
+    int16 x3 = (int16)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) )
+    {
+        int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15);
+        int16 p = convert_int16(tmp[xt/VLEN_C]);
+        int16 xp = v_xt * p, xxp = xp *v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp * v_xt;
+    }
+    // Horizontal add of the 16 lanes into lane 0.
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf;
+    int py = lidy * ((int)x0.s0);
+    int sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    // Rows in the upper half of a tall tile park their moments in local
+    // memory; rows in the lower half keep theirs in private lm[].
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((int)py) * sy;  // m03
+        m[8][lidy-bheight] = ((int)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((int)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((int)py) * sy;  // m03
+        lm[8] = ((int)x1.s0) * sy;  // m12
+        lm[7] = ((int)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Tree reduction over the rows. Each step folds the partner's value from
+    // m[] into lm[], then the upper half of the active rows republishes its
+    // lm[] for the next step.
+    // The loop starts at TILE_SIZE/2 (as CvMoments_D3/D5 do), not at bheight:
+    // for a partial tile bheight need not be a power of two, and halving an
+    // odd count leaves one row's contribution stranded in m[] (e.g. with
+    // bheight == 5, row 4 is written to m[2] and never read again). Rows at or
+    // above bheight carry zeros, so starting higher changes nothing else.
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+        F xm = x * mom[0], ym = y * mom[0];
+
+        // Store the tile moments, translated from tile to image coordinates.
+        dst_step /= sizeof(F);
+        const int tile = mad24(wgidy, dst_cols, wgidx); // this tile's slot within a row band
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+
+// Computes the ten raw spatial moments (m00 .. m03) of one tile of a 16-bit
+// unsigned image.
+//
+// Each work-group handles one TILE_SIZE x TILE_SIZE tile. The work-item with
+// local id lidy (dimension 0) accumulates image row y+lidy of the tile, the
+// per-row results are summed through local memory, and work-item (0,0) writes
+// the tile moments, shifted from tile to image coordinates, into dst_m.
+//
+//   src_data           - source pixels; src_step is the row stride in bytes
+//   src_rows, src_cols - image size in pixels
+//   dst_m              - per-tile output; moment k of tile (wgidy, wgidx) goes to
+//                        element (k*blocky)*(dst_step/sizeof(F)) + wgidy*dst_cols + wgidx
+//   depth              - not used by this kernel
+//   cn, coi            - when coi > 0, channel coi-1 is gathered from interleaved
+//                        pixels (stride 2 when cn == 2, otherwise 4)
+//   binary             - non-zero: every non-zero pixel counts as 1
+//   TILE_SIZE          - tile edge length
+//
+// NOTE(review): the reduction assumes TILE_SIZE/2 is a power of two no larger
+// than 128 (the size of the local scratch) - confirm against the host code.
+__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    ushort tmp_coi[8]; // scratch for one vector of channel-of-interest samples
+    ushort8 tmp[32];   // this work-item's tile row, 8 pixels per element
+    int VLEN_US = 8;   // vector length of ushort
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // first image row of this tile
+    int x = wgidx*TILE_SIZE;  // first image column of this tile
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/2, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        // NOTE(review): tileSize_width < TILE_SIZE implies
+        // tileSize_width == src_cols - x, so (x+i) < src_cols is already false
+        // at i == tileSize_width and this loop body never runs. It presumably
+        // was meant to clear the columns past the image edge, which the
+        // 8-wide loads below still read - confirm the intended bound.
+        if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE)
+            for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        if( coi > 0 )   // channel of interest: gather one sample per pixel
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+            {
+                for(int j=0; j<VLEN_US; j++)
+                    tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        else            // single channel: straight 8-pixel vector loads
+            for(int i=0; i < tileSize_width; i+=VLEN_US)
+                tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US);
+    }
+
+    // Binary mode: map every non-zero pixel to 255; the 1/255 scale applied at
+    // the end turns that into a weight of 1.
+    ushort8 zero = (ushort8)(0);
+    ushort8 full = (ushort8)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_US)
+            tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero;
+    F mom[10];
+    __local long m[10][128]; // scratch for the cross-row reduction
+    if(lidy < 128)
+        for(int i=0; i<10; i++)
+            m[i][lidy]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Row sums of p, x*p, x^2*p (int lanes) and x^3*p (long lanes), with x
+    // relative to the tile.
+    long lm[10] = {0};
+    int8 x0 = (int8)(0);
+    int8 x1 = (int8)(0);
+    int8 x2 = (int8)(0);
+    long8 x3 = (long8)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) )
+    {
+        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
+        int8 p = convert_int8(tmp[xt/VLEN_US]);
+        int8 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += convert_long8(xxp) *convert_long8(v_xt);
+    }
+    // Horizontal add of the 8 lanes into lane 0.
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
+
+    int py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    // Rows in the upper half of a tall tile park their moments in local
+    // memory; rows in the lower half keep theirs in private lm[].
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((long)py) * sy;  // m03
+        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((long)py) * sy;  // m03
+        lm[8] = ((long)x1.s0) * sy;  // m12
+        lm[7] = ((long)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Tree reduction over the rows. Every halving step must (1) fold the
+    // partner's value from m[] into lm[], (2) synchronise, (3) let the upper
+    // half of the active rows republish lm[] into m[], (4) synchronise again.
+    // Running all the adds first and all the write-backs afterwards, as two
+    // separate loops, re-adds the same m[] entry at every step and lets the
+    // write-backs race on the same slots, so the steps are interleaved here
+    // exactly as in CvMoments_D3.
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if(lidy == 0&&lidx == 0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        F xm = x  *mom[0], ym = y * mom[0];
+
+        // Store the tile moments, translated from tile to image coordinates.
+        dst_step /= sizeof(F);
+        const int tile = mad24(wgidy, dst_cols, wgidx); // this tile's slot within a row band
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, tile)) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, tile)) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, tile)) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, tile)) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, tile)) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, tile)) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, tile)) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, tile)) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, tile)) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, tile)) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+
+// Computes the ten raw spatial moments (m00 .. m03) of one tile of a 16-bit
+// signed image.
+//
+// Each work-group handles one TILE_SIZE x TILE_SIZE tile. The work-item with
+// local id lidy (dimension 0) accumulates image row y+lidy of the tile, the
+// per-row results are summed through local memory, and work-item (0,0) writes
+// the tile moments, shifted from tile to image coordinates, into dst_m.
+//
+//   src_data           - source pixels; src_step is the row stride in bytes
+//   src_rows, src_cols - image size in pixels
+//   dst_m              - per-tile output; moment k of tile (wgidy, wgidx) goes to
+//                        element (k*blocky)*(dst_step/sizeof(F)) + wgidy*dst_cols + wgidx
+//   depth              - not used by this kernel
+//   cn, coi            - when coi > 0, channel coi-1 is gathered from interleaved
+//                        pixels (stride 2 when cn == 2, otherwise 4)
+//   binary             - non-zero: every non-zero pixel counts as 1
+//   TILE_SIZE          - tile edge length
+//
+// NOTE(review): the reduction assumes TILE_SIZE/2 is a power of two no larger
+// than 128 (the size of the local scratch) - confirm against the host code.
+__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    short tmp_coi[8]; // scratch for one vector of channel-of-interest samples
+    short8 tmp[32];   // this work-item's tile row, 8 pixels per element
+    int VLEN_S =8; // vector length of short
+    int gidy = get_global_id(0);  // unused
+    int gidx = get_global_id(1);  // unused
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // first image row of this tile
+    int x = wgidx*TILE_SIZE;  // first image column of this tile
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/2, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+
+    if ( y+lidy < src_rows )
+    {
+        // NOTE(review): tileSize_width < TILE_SIZE implies
+        // tileSize_width == src_cols - x, so (x+i) < src_cols is already false
+        // at i == tileSize_width and this loop body never runs. It presumably
+        // was meant to clear the columns past the image edge, which the
+        // 8-wide loads below still read - confirm the intended bound.
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0;
+        // Channel of interest: gather one sample per pixel.
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+            {
+                for(int j=0; j<VLEN_S; j++)
+                    tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]);
+            }
+        // Single channel: straight 8-pixel vector loads.
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_S)
+                tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S);
+    }
+
+    // Binary mode: map every non-zero pixel to 255; the 1/255 scale applied at
+    // the end turns that into a weight of 1.
+    short8 zero = (short8)(0);
+    short8 full = (short8)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=(VLEN_S))
+            tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero;
+
+    F mom[10];
+    // Scratch for the cross-row reduction.
+    __local long m[10][128];
+    if(lidy < 128)
+        for(int i=0; i<10; i++)
+            m[i][lidy]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Row sums of p, x*p, x^2*p (int lanes) and x^3*p (long lanes), with x
+    // relative to the tile.
+    long lm[10] = {0};
+    int8 x0 = (int8)(0);
+    int8 x1 = (int8)(0);
+    int8 x2 = (int8)(0);
+    long8 x3 = (long8)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S))
+    {
+        int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7);
+        int8 p = convert_int8(tmp[xt/VLEN_S]);
+        int8 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += convert_long8(xxp) * convert_long8(v_xt);
+    }
+    // Horizontal add of the 8 lanes into lane 0.
+    x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7;
+
+    int py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    // Rows in the upper half of a tall tile park their moments in local
+    // memory; rows in the lower half keep theirs in private lm[].
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((long)py) * sy;  // m03
+        m[8][lidy-bheight] = ((long)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((long)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((long)py) * sy;  // m03
+        lm[8] = ((long)(x1.s0)) * sy;  // m12
+        lm[7] = ((long)(x2.s0)) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    // Tree reduction over the rows: each step folds the partner's value from
+    // m[] into lm[], then the upper half of the active rows republishes its
+    // lm[] into m[] for the next step. Both barriers are needed so that reads
+    // of m[] and the following writes to it never overlap.
+    for( int j = TILE_SIZE/2; j >=1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    // Work-item (0,0) now holds the tile totals in lm[].
+    if(lidy ==0 &&lidx ==0)
+    {
+        for(int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        F xm = x * mom[0], ym = y*mom[0];
+
+        // Store the tile moments, translated from tile to image coordinates.
+        // dst_step is converted from bytes to elements of F first.
+        dst_step /= sizeof(F);
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+
+__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step,
+                            __global F* dst_m,
+                            int dst_cols, int dst_step, int blocky,
+                            int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    float tmp_coi[4]; // get the coi data
+    float4 tmp[64] ;
+    int VLEN_F = 4; // vector length of float
+    int gidy = get_global_id(0);
+    int gidx = get_global_id(1);
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int rstep = min(src_step/4, TILE_SIZE);
+    int tileSize_height = min(TILE_SIZE, src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols -x);
+    int maxIdx = mul24(src_rows, src_cols);
+    int yOff = (y+lidy)*src_step;
+    int index;
+
+    if ( y+lidy < src_rows )
+    {
+        if(tileSize_width < TILE_SIZE)
+            for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ )
+                *((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
+            {
+                for(int j=0; j<4; j++)
+                    tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1);
+                tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_F)
+                tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3));
+    }
+
+    float4 zero = (float4)(0);
+    float4 full = (float4)(255);
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=4)
+            tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero;
+    F mom[10];
+    __local F m[10][128];
+    if(lidy < 128)
+        for(int i = 0; i < 10; i ++)
+            m[i][lidy] = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    F lm[10] = {0};
+    F4 x0 = (F4)(0);
+    F4 x1 = (F4)(0);
+    F4 x2 = (F4)(0);
+    F4 x3 = (F4)(0);
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F )
+    {
+        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+        F4 p = convert_F4(tmp[xt/VLEN_F]);
+        F4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp * v_xt;
+    }
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    F py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((F)py) * sy;  // m03
+        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+
+    else if(lidy < bheight)
+    {
+        lm[9] = ((F)py) * sy;  // m03
+        lm[8] = ((F)x1.s0) * sy;  // m12
+        lm[7] = ((F)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        F xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
+
+// Computes the ten raw spatial moments (m00 .. m03) of one TILE_SIZE x TILE_SIZE
+// tile of the source and stores them, already shifted to image coordinates,
+// in dst_m.
+//
+// Work decomposition, as used by the code below:
+//   * the work-group id selects the tile (dimension 0 -> tile row, dimension 1
+//     -> tile column);
+//   * the local id in dimension 0 (lidy) selects the row inside the tile; each
+//     work-item loads its row into private memory and reduces it to the four
+//     row sums sum(p), sum(x*p), sum(x^2*p) and sum(x^3*p);
+//   * the per-row moments are summed across the work-group through the local
+//     array m, and work-item (0, 0) writes the result.
+//
+// Parameters:
+//   src_data            source elements of type F; src_step is in bytes and is
+//                       divided by 8, so F is expected to be an 8-byte type
+//   src_rows, src_cols  source size
+//   dst_m               output; moment k of tile (wgidy, wgidx) is written at
+//                       DST_ROW_k * blocky * dst_step + wgidy * dst_cols + wgidx,
+//                       with dst_step converted from bytes to elements
+//   blocky              multiplier applied to the DST_ROW_* indices
+//   depth               unused
+//   cn, coi             when coi > 0, element (column * kcn + coi - 1) is read
+//                       for each column, kcn being 2 for cn == 2 and 4 otherwise
+//   binary              non-zero: every non-zero element counts as 1
+//   TILE_SIZE           tile edge length; the private row buffer (64 F4) and the
+//                       local array (128 rows) limit it to 256
+__kernel void CvMoments_D6(__global F* src_data,  int src_rows, int src_cols, int src_step,
+                           __global F* dst_m,
+                           int dst_cols, int dst_step, int blocky,
+                           int depth, int cn, int coi, int binary, const int TILE_SIZE)
+{
+    F4 tmp[64];     // this work-item's tile row, four elements per entry
+    int VLEN_D = 4; // length of vector
+    int wgidy = get_group_id(0);
+    int wgidx = get_group_id(1);
+    int lidy = get_local_id(0);
+    int lidx = get_local_id(1);
+    int y = wgidy*TILE_SIZE;  // real Y index of pixel
+    int x = wgidx*TILE_SIZE;  // real X index of pixel
+    int kcn = (cn==2)?2:4;
+    int tileSize_height = min(TILE_SIZE,  src_rows - y);
+    int tileSize_width = min(TILE_SIZE, src_cols - x);
+
+    F4 zero = (F4)(0);
+    F4 full = (F4)(255);
+
+    // Start from an all-zero row: every entry that is summed below is then
+    // defined, even for lanes that are never loaded (rows below the image,
+    // the tail of the last tile, failed bounds checks).
+    for(int i=0; i < tileSize_width; i+=VLEN_D)
+        tmp[i/VLEN_D] = zero;
+
+    if ( y+lidy < src_rows )
+    {
+        __global const F* row = src_data+(y+lidy)*src_step/8;
+        if( coi > 0 )
+            for(int i=0; i < tileSize_width; i+=VLEN_D)
+            {
+                // Lanes rejected by the bounds check stay zero instead of
+                // keeping whatever the previous group left behind.
+                F tmp_coi[4] = {0, 0, 0, 0};
+                for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++)
+                    tmp_coi[j] = row[(x+i+j)*kcn+coi-1];
+                tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]);
+            }
+        else
+            for(int i=0; i < tileSize_width; i+=VLEN_D)
+            {
+                // Load up to four elements but stop at the tile edge, so a
+                // tile whose width is not a multiple of four keeps its last
+                // elements and nothing is read past the image row.
+                F4 v = zero;
+                v.s0 = row[x+i];
+                if( i+1 < tileSize_width ) v.s1 = row[x+i+1];
+                if( i+2 < tileSize_width ) v.s2 = row[x+i+2];
+                if( i+3 < tileSize_width ) v.s3 = row[x+i+3];
+                tmp[i/VLEN_D] = v;
+            }
+    }
+
+    // Binary mode: map every non-zero element to 255; the 1/255 scaling at the
+    // end turns that into a weight of 1.
+    if( binary )
+        for(int i=0; i < tileSize_width; i+=VLEN_D)
+            tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero;
+    F mom[10];
+    __local F m[10][128];
+    if(lidy < 128)
+        for(int i=0; i<10; i++)
+            m[i][lidy]=0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+    F lm[10] = {0};
+    F4 x0 = (F4)(0);
+    F4 x1 = (F4)(0);
+    F4 x2 = (F4)(0);
+    F4 x3 = (F4)(0);
+    // Row sums, four columns at a time: x0 = sum(p), x1 = sum(x*p),
+    // x2 = sum(x^2*p), x3 = sum(x^3*p), with x counted from the tile's left edge.
+    for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D )
+    {
+        F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3);
+        F4 p = tmp[xt/VLEN_D];
+        F4 xp = v_xt * p, xxp = xp * v_xt;
+        x0 += p;
+        x1 += xp;
+        x2 += xxp;
+        x3 += xxp *v_xt;
+    }
+    // Fold the four lanes of each accumulator into lane 0.
+    x0.s0 += x0.s1 + x0.s2 + x0.s3;
+    x1.s0 += x1.s1 + x1.s2 + x1.s3;
+    x2.s0 += x2.s1 + x2.s2 + x2.s3;
+    x3.s0 += x3.s1 + x3.s2 + x3.s3;
+
+    // lidy is the row's y coordinate relative to the tile.
+    F py = lidy * x0.s0, sy = lidy*lidy;
+    int bheight = min(tileSize_height, TILE_SIZE/2);
+    // Rows in the upper half of the tile publish their moments to local
+    // memory; rows in the lower half keep theirs in the private array lm.
+    if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height)
+    {
+        m[9][lidy-bheight] = ((F)py) * sy;  // m03
+        m[8][lidy-bheight] = ((F)x1.s0) * sy;  // m12
+        m[7][lidy-bheight] = ((F)x2.s0) * lidy;  // m21
+        m[6][lidy-bheight] = x3.s0;             // m30
+        m[5][lidy-bheight] = x0.s0 * sy;        // m02
+        m[4][lidy-bheight] = x1.s0 * lidy;         // m11
+        m[3][lidy-bheight] = x2.s0;             // m20
+        m[2][lidy-bheight] = py;             // m01
+        m[1][lidy-bheight] = x1.s0;             // m10
+        m[0][lidy-bheight] = x0.s0;             // m00
+    }
+    else if(lidy < bheight)
+    {
+        lm[9] = ((F)py) * sy;  // m03
+        lm[8] = ((F)x1.s0) * sy;  // m12
+        lm[7] = ((F)x2.s0) * lidy;  // m21
+        lm[6] = x3.s0;             // m30
+        lm[5] = x0.s0 * sy;        // m02
+        lm[4] = x1.s0 * lidy;         // m11
+        lm[3] = x2.s0;             // m20
+        lm[2] = py;             // m01
+        lm[1] = x1.s0;             // m10
+        lm[0] = x0.s0;             // m00
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Halving reduction over the rows: in each round the first j work-items
+    // add the value published for them, then the upper half of those
+    // work-items republish their running sum for the next round. After the
+    // last round work-item 0 holds the tile total in lm.
+    for( int j = TILE_SIZE/2; j >= 1; j = j/2 )
+    {
+        if(lidy < j)
+            for( int i = 0; i < 10; i++ )
+                lm[i] = lm[i] + m[i][lidy];
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lidy >= j/2&&lidy < j)
+            for( int i = 0; i < 10; i++ )
+                m[i][lidy-j/2] = lm[i];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lidy == 0&&lidx == 0)
+    {
+        for( int mt = 0; mt < 10; mt++ )
+            mom[mt] = (F)lm[mt];
+        if(binary)
+        {
+            F s = 1./255;
+            for( int mt = 0; mt < 10; mt++ )
+                mom[mt] *= s;
+        }
+
+        // The moments above are relative to the tile origin (x, y); the
+        // formulas below translate them to image coordinates.
+        F xm = x * mom[0], ym = y * mom[0];
+
+        // accumulate moments computed in each tile
+        dst_step /= sizeof(F);
+
+        // + m00 ( = m00' )
+        *(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0];
+
+        // + m10 ( = m10' + x*m00' )
+        *(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm;
+
+        // + m01 ( = m01' + y*m00' )
+        *(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym;
+
+        // + m20 ( = m20' + 2*x*m10' + x*x*m00' )
+        *(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm);
+
+        // + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' )
+        *(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1];
+
+        // + m02 ( = m02' + 2*y*m01' + y*y*m00' )
+        *(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym);
+
+        // + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' )
+        *(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm));
+
+        // + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20')
+        *(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3];
+
+        // + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02')
+        *(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5];
+
+        // + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' )
+        *(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym));
+    }
+}
--- a/modules/imgproc/src/opencl/morph.cl
+++ b/modules/imgproc/src/opencl/morph.cl
@ -0,0 +1,228 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Zero Lin, zero.lin@amd.com
+//    Yao Wang, bitwangyaoyao@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+
+
+// MORPH_OP(A,B) combines two samples: the minimum when ERODE is defined, the
+// maximum when DILATE is defined. Exactly one of the two must be supplied by
+// the build options, otherwise MORPH_OP is left undefined.
+#ifdef ERODE
+#define MORPH_OP(A,B) min((A),(B))
+#endif
+#ifdef DILATE
+#define MORPH_OP(A,B) max((A),(B))
+#endif
+//BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii
+// ELEM(i, l_edge, r_edge, elem1, elem2) yields elem1 when i lies outside
+// [l_edge, r_edge) and elem2 otherwise. The two range tests are combined with
+// a bitwise OR, so both are always evaluated. The whole expansion is
+// parenthesised so that it stays a single operand wherever it is used.
+#define ELEM(i,l_edge,r_edge,elem1,elem2) (((i)<(l_edge) | (i) >= (r_edge)) ? (elem1) : (elem2))
+#ifndef GENTYPE
+
+// Erosion/dilation for single-channel uchar data, four output pixels per
+// work-item.
+//
+// The work-group first stages the source region it needs (its own pixels plus
+// a RADIUSX / RADIUSY border) in local memory as uchar4 words, two words per
+// work-item, replacing everything that lies outside the whole source image
+// with VAL. Each work-item then folds the (2*RADIUSY+1) x (2*RADIUSX+1)
+// neighbourhood with MORPH_OP and stores up to four results.
+//
+// Compile-time inputs: LSIZE0, LSIZE1, RADIUSX, RADIUSY, VAL, ERODE or DILATE,
+// and optionally RECTKERNEL (when defined, mat_kernel is not consulted).
+//
+// Parameters:
+//   src, dst                  source and destination buffers
+//   src_offset_x/y            position of the processed region inside the whole source
+//   cols, rows                size of the destination region
+//   src/dst_step_in_pixel     row strides, in elements
+//   mat_kernel                structuring element, (2*RADIUSY+1) rows of (2*RADIUSX+1) flags
+//   src_whole_cols/rows       size of the whole source image
+//   dst_offset_in_pixel       offset added to the destination index
+__kernel void morph_C1_D0(__global const uchar * restrict src,
+                          __global uchar *dst,
+                          int src_offset_x, int src_offset_y,
+                          int cols, int rows,
+                          int src_step_in_pixel, int dst_step_in_pixel,
+                          __constant uchar * mat_kernel,
+                          int src_whole_cols, int src_whole_rows,
+                          int dst_offset_in_pixel)
+{
+    int l_x = get_local_id(0);
+    int l_y = get_local_id(1);
+    // First output pixel handled by this work-group (4 pixels per work-item in x).
+    int x = get_group_id(0)*4*LSIZE0;
+    int y = get_group_id(1)*LSIZE1;
+    // Left and right edges of the staged region, rounded down to a multiple of 4
+    // ('&' binds more loosely than '+' and '-', so the mask applies to the whole sum).
+    int start_x = x+src_offset_x-RADIUSX & 0xfffffffc;
+    int end_x = x + src_offset_x+LSIZE0*4+RADIUSX & 0xfffffffc;
+    // Number of uchar4 words in one staged row.
+    int width = (end_x -start_x+4)>>2;
+    // Byte offset, inside the first staged word, of the leftmost pixel that is needed.
+    int offset = src_offset_x-RADIUSX & 3;
+    int start_y = y+src_offset_y-RADIUSY;
+    // Each work-item fills two slots of the staging buffer: point1 and point2.
+    int point1 = mad24(l_y,LSIZE0,l_x);
+    int point2 = point1 + LSIZE0*LSIZE1;
+    // Slot index -> (x, y) inside the staged region; x is in pixels, hence << 2.
+    int tl_x = (point1 % width)<<2;
+    int tl_y = point1 / width;
+    int tl_x2 = (point2 % width)<<2;
+    int tl_y2 = point2 / width;
+    // Same positions in whole-image coordinates.
+    int cur_x = start_x + tl_x;
+    int cur_y = start_y + tl_y;
+    int cur_x2 = start_x + tl_x2;
+    int cur_y2 = start_y + tl_y2;
+    int start_addr = mad24(cur_y,src_step_in_pixel,cur_x);
+    int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2);
+    uchar4 temp0,temp1;
+    __local uchar4 LDS_DAT[2*LSIZE1*LSIZE0];
+
+    int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols);
+    //read pixels from src
+    // Addresses outside (0, end_addr) are redirected to element 0; the value
+    // read there is discarded by the border tests below.
+    start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0;
+    start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0;
+    temp0 = *(__global uchar4*)&src[start_addr];
+    temp1 = *(__global uchar4*)&src[start_addr2];
+    //judge if read out of boundary
+    // Per lane: columns outside [0, src_whole_cols) become VAL; a row outside
+    // [0, src_whole_rows) turns the whole word into VAL.
+    temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x);
+    temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y);
+    temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z);
+    temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w);
+    temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0);
+
+    temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x);
+    temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y);
+    temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z);
+    temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w);
+    temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1);
+
+    LDS_DAT[point1] = temp0;
+    LDS_DAT[point2] = temp1;
+    // All staged words must be visible before any work-item reads its neighbourhood.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    uchar4 res = (uchar4)VAL;
+
+    // Fold the neighbourhood. The staging buffer is addressed byte-wise so that
+    // vload4 can fetch four consecutive pixels starting at an arbitrary column.
+    // Without RECTKERNEL a tap only contributes where mat_kernel is non-zero.
+    for(int i=0; i<2*RADIUSY+1; i++)
+        for(int j=0; j<2*RADIUSX+1; j++)
+        {
+            res =
+#ifndef RECTKERNEL
+                mat_kernel[i*(2*RADIUSX+1)+j] ?
+#endif
+                MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j))
+#ifndef RECTKERNEL
+                :res
+#endif
+                ;
+        }
+
+    // First of the four output columns written by this work-item.
+    int gidx = get_global_id(0)<<2;
+    int gidy = get_global_id(1);
+    int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel);
+
+    // One uchar4 store when all four pixels are inside the destination and the
+    // destination offset is a multiple of 4; otherwise store pixel by pixel and
+    // only the pixels that are inside.
+    if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0))
+    {
+        *(__global uchar4*)&dst[out_addr] = res;
+    }
+    else
+    {
+        if(gidx+3<cols && gidy<rows)
+        {
+            dst[out_addr] = res.x;
+            dst[out_addr+1] = res.y;
+            dst[out_addr+2] = res.z;
+            dst[out_addr+3] = res.w;
+        }
+        else if(gidx+2<cols && gidy<rows)
+        {
+            dst[out_addr] = res.x;
+            dst[out_addr+1] = res.y;
+            dst[out_addr+2] = res.z;
+        }
+        else if(gidx+1<cols && gidy<rows)
+        {
+            dst[out_addr] = res.x;
+            dst[out_addr+1] = res.y;
+        }
+        else if(gidx<cols && gidy<rows)
+        {
+            dst[out_addr] = res.x;
+        }
+    }
+}
+
+#else
+
+// Generic erosion/dilation: one work-item per output element of type GENTYPE.
+//
+// Every work-item copies two elements of the source region (the work-group's
+// own pixels plus a RADIUSX / RADIUSY border) into local memory, substituting
+// VAL for anything outside the whole source image, and then folds its
+// (2*RADIUSY+1) x (2*RADIUSX+1) neighbourhood with MORPH_OP. Unless RECTKERNEL
+// is defined, only taps whose mat_kernel entry is non-zero take part.
+//
+// Parameters:
+//   src, dst                  source and destination buffers
+//   src_offset_x/y            position of the processed region inside the whole source
+//   cols, rows                size of the destination region
+//   src/dst_step_in_pixel     row strides, in elements
+//   mat_kernel                structuring element flags, row-major
+//   src_whole_cols/rows       size of the whole source image
+//   dst_offset_in_pixel       offset added to the destination index
+__kernel void morph(__global const GENTYPE * restrict src,
+                    __global GENTYPE *dst,
+                    int src_offset_x, int src_offset_y,
+                    int cols, int rows,
+                    int src_step_in_pixel, int dst_step_in_pixel,
+                    __constant uchar * mat_kernel,
+                    int src_whole_cols, int src_whole_rows,
+                    int dst_offset_in_pixel)
+{
+    __local GENTYPE staged[2*LSIZE1*LSIZE0];
+
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // Origin of the work-group's output block and of the staged region.
+    const int x = get_group_id(0)*LSIZE0;
+    const int y = get_group_id(1)*LSIZE1;
+    const int start_x = x+src_offset_x-RADIUSX;
+    const int start_y = y+src_offset_y-RADIUSY;
+    const int end_x = x + src_offset_x+LSIZE0+RADIUSX;
+    const int width = end_x - start_x + 1;   // elements per staged row
+
+    // The two staging slots owned by this work-item.
+    const int slot_a = mad24(ly, LSIZE0, lx);
+    const int slot_b = slot_a + LSIZE0*LSIZE1;
+
+    // Whole-image coordinates of the element that belongs in each slot.
+    const int ax = start_x + slot_a % width;
+    const int ay = start_y + slot_a / width;
+    const int bx = start_x + slot_b % width;
+    const int by = start_y + slot_b / width;
+
+    // Source indices; anything outside (0, last_addr) is redirected to
+    // element 0 and overwritten by the border test below.
+    const int last_addr = mad24(src_whole_rows - 1, src_step_in_pixel, src_whole_cols);
+    int addr_a = mad24(ay, src_step_in_pixel, ax);
+    int addr_b = mad24(by, src_step_in_pixel, bx);
+    if (addr_a <= 0 || addr_a >= last_addr)
+        addr_a = 0;
+    if (addr_b <= 0 || addr_b >= last_addr)
+        addr_b = 0;
+
+    GENTYPE val_a = src[addr_a];
+    GENTYPE val_b = src[addr_b];
+
+    // Constant border: positions outside the whole image read as VAL.
+    if (ax < 0 || ax >= src_whole_cols || ay < 0 || ay >= src_whole_rows)
+        val_a = (GENTYPE)VAL;
+    if (bx < 0 || bx >= src_whole_cols || by < 0 || by >= src_whole_rows)
+        val_b = (GENTYPE)VAL;
+
+    staged[slot_a] = val_a;
+    staged[slot_b] = val_b;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    GENTYPE acc = (GENTYPE)VAL;
+    for (int ky = 0; ky < 2*RADIUSY+1; ky++)
+    {
+        for (int kx = 0; kx < 2*RADIUSX+1; kx++)
+        {
+#ifdef RECTKERNEL
+            acc = MORPH_OP(acc, staged[mad24(ly+ky, width, lx+kx)]);
+#else
+            if (mat_kernel[ky*(2*RADIUSX+1)+kx])
+                acc = MORPH_OP(acc, staged[mad24(ly+ky, width, lx+kx)]);
+#endif
+        }
+    }
+
+    const int gx = get_global_id(0);
+    const int gy = get_global_id(1);
+    if (gx < cols && gy < rows)
+        dst[mad24(gy, dst_step_in_pixel, gx + dst_offset_in_pixel)] = acc;
+}
+
+#endif
--- a/modules/imgproc/src/opencl/pyramid.cl
+++ b/modules/imgproc/src/opencl/pyramid.cl
--- a/modules/imgproc/src/opencl/remap.cl
+++ b/modules/imgproc/src/opencl/remap.cl
@ -0,0 +1,323 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Wu Zailong, bullet@yeah.net
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Enable double precision when the build asks for it (DOUBLE_SUPPORT) and the
+// device advertises one of the fp64 extensions.
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+#endif
+
+// Nearest-neighbour mode copies pixels unchanged: convertToWT is defined empty,
+// so convertToWT(expr) collapses to (expr).
+#ifdef INTER_NEAREST
+#define convertToWT
+#endif
+
+// EXTRAPOLATE(v2, v) produces the value for a source position v2 (an int2
+// lvalue) that lies outside the image and assigns it to v. Exactly one border
+// mode must be selected at build time. The expansions use names from the
+// calling scope: scalar (BORDER_CONSTANT), and src, src_step, src_offset,
+// src_cols, src_rows plus, for BORDER_REPLICATE, an int2 named zero.
+// Every mode except BORDER_CONSTANT rewrites v2 in place.
+// NOTE(review): the BORDER_CONSTANT form already ends in ';' and the other
+// forms are bare brace blocks, so "EXTRAPOLATE(...);" leaves an extra empty
+// statement. That is harmless as the last statement of an if/else, but the
+// macro cannot be used unbraced between an 'if' and its 'else'.
+#ifdef BORDER_CONSTANT
+#define EXTRAPOLATE(v2, v) v = scalar;
+#elif defined BORDER_REPLICATE
+// Clamp the position to the nearest valid pixel.
+#define EXTRAPOLATE(v2, v) \
+    { \
+        v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \
+        v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
+    }
+#elif defined BORDER_WRAP
+// Wrap the position around the image, separately for x and y.
+#define EXTRAPOLATE(v2, v) \
+    { \
+        if (v2.x < 0) \
+            v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \
+        if (v2.x >= src_cols) \
+            v2.x %= src_cols; \
+        \
+        if (v2.y < 0) \
+            v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \
+        if( v2.y >= src_rows ) \
+            v2.y %= src_rows; \
+        v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
+    }
+#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
+// Mirror the position at the image edges until it falls inside. delta is 0
+// for BORDER_REFLECT and 1 for BORDER_REFLECT_101. Each axis is mirrored at
+// least once, including an axis that is already in range; a one-pixel-wide
+// axis maps to 0 directly.
+#ifdef BORDER_REFLECT
+#define DELTA int delta = 0
+#else
+#define DELTA int delta = 1
+#endif
+#define EXTRAPOLATE(v2, v) \
+    { \
+        DELTA; \
+        if (src_cols == 1) \
+            v2.x = 0; \
+        else \
+            do \
+            { \
+                if( v2.x < 0 ) \
+                    v2.x = -v2.x - 1 + delta; \
+                else \
+                    v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \
+            } \
+            while (v2.x >= src_cols || v2.x < 0); \
+        \
+        if (src_rows == 1) \
+            v2.y = 0; \
+        else \
+            do \
+            { \
+                if( v2.y < 0 ) \
+                    v2.y = -v2.y - 1 + delta; \
+                else \
+                    v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \
+            } \
+            while (v2.y >= src_rows || v2.y < 0); \
+        v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \
+    }
+#else
+#error No extrapolation method
+#endif
+
+// True when (gx, gy) is outside the source image. The arguments are pasted
+// unparenthesised, so pass plain variables or member accesses only.
+#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0)
+
+#ifdef INTER_NEAREST
+
+// Nearest-neighbour remap with the source coordinates given by two float maps
+// (map1 holds x, map2 holds y). One work-item per destination element.
+//
+// Offsets and steps are used directly as element indices. scalar is the value
+// written by EXTRAPOLATE in BORDER_CONSTANT mode.
+__kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst,
+        __global float * map1, __global float * map2,
+        int src_offset, int dst_offset, int map1_offset, int map2_offset,
+        int src_step, int dst_step, int map1_step, int map2_step,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Work-items that fall outside the destination have nothing to write.
+    if (x >= dst_cols || y >= dst_rows)
+        return;
+
+    const int dstIdx = mad24(y, dst_step, x + dst_offset);
+
+    // Round the mapped position to the nearest integer (ties to even), saturating.
+    const int gx = convert_int_sat_rte(map1[mad24(y, map1_step, x + map1_offset)]);
+    const int gy = convert_int_sat_rte(map2[mad24(y, map2_step, x + map2_offset)]);
+
+    if (!NEED_EXTRAPOLATION(gx, gy))
+    {
+        dst[dstIdx] = src[mad24(gy, src_step, gx + src_offset)];
+    }
+    else
+    {
+        // gxy and zero are consumed by the EXTRAPOLATE expansion.
+        int2 gxy = (int2)(gx, gy), zero = (int2)(0);
+        EXTRAPOLATE(gxy, dst[dstIdx]);
+    }
+}
+
+// Nearest-neighbour remap with the source coordinates packed as (x, y) pairs
+// in a single float2 map. One work-item per destination element.
+//
+// Offsets and steps are used directly as element indices. scalar is the value
+// written by EXTRAPOLATE in BORDER_CONSTANT mode.
+__kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __global float2 * map1,
+        int src_offset, int dst_offset, int map1_offset,
+        int src_step, int dst_step, int map1_step,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Work-items that fall outside the destination have nothing to write.
+    if (x >= dst_cols || y >= dst_rows)
+        return;
+
+    const int dstIdx = mad24(y, dst_step, x + dst_offset);
+
+    // Round both coordinates to the nearest integer (ties to even), saturating.
+    int2 gxy = convert_int2_sat_rte(map1[mad24(y, map1_step, x + map1_offset)]);
+
+    if (!NEED_EXTRAPOLATION(gxy.x, gxy.y))
+    {
+        dst[dstIdx] = src[mad24(gxy.y, src_step, gxy.x + src_offset)];
+    }
+    else
+    {
+        // zero is consumed by the EXTRAPOLATE expansion.
+        int2 zero = (int2)(0);
+        EXTRAPOLATE(gxy, dst[dstIdx]);
+    }
+}
+
+// Nearest-neighbour remap with integer source coordinates stored as (x, y)
+// pairs in a short2 map. One work-item per destination element.
+//
+// Offsets and steps are used directly as element indices. scalar is the value
+// written by EXTRAPOLATE in BORDER_CONSTANT mode.
+__kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __global short2 * map1,
+        int src_offset, int dst_offset, int map1_offset,
+        int src_step, int dst_step, int map1_step,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Work-items that fall outside the destination have nothing to write.
+    if (x >= dst_cols || y >= dst_rows)
+        return;
+
+    const int dstIdx = mad24(y, dst_step, x + dst_offset);
+
+    // The map already holds integer positions; only widen them to int.
+    int2 gxy = convert_int2(map1[mad24(y, map1_step, x + map1_offset)]);
+
+    if (!NEED_EXTRAPOLATION(gxy.x, gxy.y))
+    {
+        dst[dstIdx] = src[mad24(gxy.y, src_step, gxy.x + src_offset)];
+    }
+    else
+    {
+        // zero is consumed by the EXTRAPOLATE expansion.
+        int2 zero = (int2)(0);
+        EXTRAPOLATE(gxy, dst[dstIdx]);
+    }
+}
+
+#elif defined INTER_LINEAR
+
+// Bilinear remap with the source coordinates given by two float maps (map1
+// holds x, map2 holds y). One work-item per destination element.
+//
+// The four neighbours of the mapped position are fetched (or extrapolated
+// according to the selected border mode), converted to the working type WT
+// and blended. The fractional part of the position is quantised to steps of
+// 1/32 before it is used as a weight.
+//
+// Offsets and steps are used directly as element indices. nVal is the border
+// value; it is converted to WT and exposed as 'scalar' for EXTRAPOLATE.
+__kernel void remap_2_32FC1(__global T const * restrict  src, __global T * dst,
+        __global float * map1, __global float * map2,
+        int src_offset, int dst_offset, int map1_offset, int map2_offset,
+        int src_step, int dst_step, int map1_step, int map2_step,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    if (x < dst_cols && y < dst_rows)
+    {
+        int dstIdx = mad24(y, dst_step, x + dst_offset);
+        int map1Idx = mad24(y, map1_step, x + map1_offset);
+        int map2Idx = mad24(y, map2_step, x + map2_offset);
+
+        float2 map_data = (float2)(map1[map1Idx], map2[map2Idx]);
+
+        // A is the neighbour at the floor of the position; B, C and D are its
+        // right, lower and lower-right neighbours.
+        int2 map_dataA = convert_int2_sat_rtn(map_data);
+        int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y);
+        int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1);
+        int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1);
+        int2 zero = (int2)(0);   // consumed by the EXTRAPOLATE expansion
+
+        // Fractional offset from A, rounded to the nearest multiple of 1/32.
+        // It must be taken before EXTRAPOLATE gets a chance to rewrite map_dataA.
+        float2 _u = map_data - convert_float2(map_dataA);
+        WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32;
+        WT scalar = convertToWT(nVal);
+        WT a = scalar, b = scalar, c = scalar, d = scalar;
+
+        if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y))
+            a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]);
+        else
+            EXTRAPOLATE(map_dataA, a);
+
+        if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y))
+            b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]);
+        else
+            EXTRAPOLATE(map_dataB, b);
+
+        if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y))
+            c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]);
+        else
+            EXTRAPOLATE(map_dataC, c);
+
+        if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y))
+            d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]);
+        else
+            EXTRAPOLATE(map_dataD, d);
+
+        // Bilinear blend of the four neighbours.
+        WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) +
+                      b * (WT)(u.x)     * (WT)(1 - u.y) +
+                      c * (WT)(1 - u.x) * (WT)(u.y) +
+                      d * (WT)(u.x)     * (WT)(u.y);
+        dst[dstIdx] = convertToT(dst_data);
+    }
+}
+
+// Bilinear remap with the source coordinates packed as (x, y) pairs in a
+// single float2 map. One work-item per destination element.
+//
+// The four neighbours of the mapped position are fetched (or extrapolated
+// according to the selected border mode), converted to the working type WT
+// and blended with weights quantised to steps of 1/32.
+//
+// Offsets and steps are used directly as element indices. nVal is the border
+// value; it is converted to WT and exposed as 'scalar' for EXTRAPOLATE.
+__kernel void remap_32FC2(__global T const * restrict  src, __global T * dst,
+        __global float2 * map1,
+        int src_offset, int dst_offset, int map1_offset,
+        int src_step, int dst_step, int map1_step,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Work-items that fall outside the destination have nothing to write.
+    if (x >= dst_cols || y >= dst_rows)
+        return;
+
+    const int dstIdx = mad24(y, dst_step, x + dst_offset);
+    const float2 pos = map1[mad24(y, map1_step, x + map1_offset)];
+
+    // Top-left neighbour is the floor of the position; the others follow from it.
+    int2 tl = convert_int2_sat_rtn(pos);
+    int2 tr = (int2)(tl.x + 1, tl.y);
+    int2 bl = (int2)(tl.x, tl.y + 1);
+    int2 br = (int2)(tl.x + 1, tl.y + 1);
+    int2 zero = (int2)(0);   // consumed by the EXTRAPOLATE expansion
+
+    // Fractional offset from the top-left neighbour, rounded to a multiple of
+    // 1/32. Taken before EXTRAPOLATE can rewrite tl.
+    const float2 frac = pos - convert_float2(tl);
+    WT2 u = convertToWT2(convert_int2_rte(convertToWT2(frac) * (WT2)32)) / (WT2)32;
+    WT scalar = convertToWT(nVal);
+    WT a = scalar, b = scalar, c = scalar, d = scalar;
+
+    if (NEED_EXTRAPOLATION(tl.x, tl.y))
+    {
+        EXTRAPOLATE(tl, a);
+    }
+    else
+        a = convertToWT(src[mad24(tl.y, src_step, tl.x + src_offset)]);
+
+    if (NEED_EXTRAPOLATION(tr.x, tr.y))
+    {
+        EXTRAPOLATE(tr, b);
+    }
+    else
+        b = convertToWT(src[mad24(tr.y, src_step, tr.x + src_offset)]);
+
+    if (NEED_EXTRAPOLATION(bl.x, bl.y))
+    {
+        EXTRAPOLATE(bl, c);
+    }
+    else
+        c = convertToWT(src[mad24(bl.y, src_step, bl.x + src_offset)]);
+
+    if (NEED_EXTRAPOLATION(br.x, br.y))
+    {
+        EXTRAPOLATE(br, d);
+    }
+    else
+        d = convertToWT(src[mad24(br.y, src_step, br.x + src_offset)]);
+
+    // Horizontal and vertical weights of the far (wx, wy) and near (wx1, wy1) sides.
+    const WT wx = (WT)(u.x);
+    const WT wy = (WT)(u.y);
+    const WT wx1 = (WT)(1 - u.x);
+    const WT wy1 = (WT)(1 - u.y);
+
+    WT dst_data = a * wx1 * wy1 +
+                  b * wx  * wy1 +
+                  c * wx1 * wy +
+                  d * wx  * wy;
+    dst[dstIdx] = convertToT(dst_data);
+}
+
+#endif
--- a/modules/imgproc/src/opencl/resize.cl
+++ b/modules/imgproc/src/opencl/resize.cl
@ -0,0 +1,152 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Ying, zhangying913@gmail.com
+//	  Niko Li, newlife20080214@gmail.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+// resize kernel
+// Currently, CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 are supported.
+// We shall support other types later if necessary.
+
+// F is the floating-point type used by resizeNN for source coordinates:
+// double when DOUBLE_SUPPORT is defined, float otherwise.
+#if defined DOUBLE_SUPPORT
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#define F double
+#else
+#define F float
+#endif
+
+// Fixed-point setup for the integer bilinear path: each weight carries
+// INTER_RESIZE_COEF_BITS fractional bits, so a product of two weights carries CAST_BITS.
+#define INTER_RESIZE_COEF_BITS 11
+#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
+#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
+#define CAST_SCALE (1.0f/(1<<CAST_BITS))
+// Successor of index x, clamped to the last valid index l-1. The arguments are
+// parenthesized so that expression arguments expand with the intended precedence.
+#define INC(x,l) min((x)+1,(l)-1)
+
+// Size of one pixel in bytes. PIXTYPE is not defined in this file; presumably it is
+// supplied through the kernel build options - confirm against the host code.
+#define PIXSIZE ((int)sizeof(PIXTYPE))
+#define noconvert(x) (x)
+
+#if defined INTER_LINEAR
+
+// Bilinear resize: each work-item produces the destination pixel (dx, dy).
+//
+// srcptr/dstptr are byte pointers; the step and offset arguments are added to them
+// before the cast to PIXTYPE, i.e. they are used as byte quantities.
+// ifx/ify scale destination coordinates into source coordinates.
+// PIXTYPE, WORKTYPE, convertToWT, convertToDT and depth are not defined in this file;
+// presumably they come from the kernel build options - confirm against the host code.
+__kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset,
+                       int srcrows, int srccols,
+                       __global uchar* dstptr, int dststep, int dstoffset,
+                       int dstrows, int dstcols,
+                       float ifx, float ify)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    // Map the destination pixel centre back into the source image.
+    float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
+    int x = floor(sx), y = floor(sy);
+
+    float u = sx - x, v = sy - y;
+
+    // Clamp to the source image; at a clamped border the fractional weight is dropped.
+    if ( x<0 ) x=0,u=0;
+    if ( x>=srccols ) x=srccols-1,u=0;
+    if ( y<0 ) y=0,v=0;
+    if ( y>=srcrows ) y=srcrows-1,v=0;
+
+    // Right / lower neighbour indices, clamped to the last column / row.
+    int y_ = INC(y,srcrows);
+    int x_ = INC(x,srccols);
+
+    // The four neighbours are loaded the same way on both code paths. The casts keep the
+    // __global address space of srcptr.
+    WORKTYPE data0 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)));
+    WORKTYPE data1 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE)));
+    WORKTYPE data2 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE)));
+    WORKTYPE data3 = convertToWT(*(__global const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE)));
+
+#if depth == 0
+    // Integer path: weights are scaled to INTER_RESIZE_COEF_BITS fractional bits each.
+    u = u * INTER_RESIZE_COEF_SCALE;
+    v = v * INTER_RESIZE_COEF_SCALE;
+
+    int U = rint(u);
+    int V = rint(v);
+    int U1 = rint(INTER_RESIZE_COEF_SCALE - u);
+    int V1 = rint(INTER_RESIZE_COEF_SCALE - v);
+
+    WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) +
+               mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3);
+
+    // Round and drop the CAST_BITS fractional bits carried by the weight products.
+    PIXTYPE uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
+#else
+    // Floating-point path: plain weighted sum of the four neighbours.
+    float u1 = 1.f-u;
+    float v1 = 1.f-v;
+    PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v * data2 + u * v * data3;
+#endif
+
+    // Work-items outside the destination image compute a value but do not store it.
+    if(dx < dstcols && dy < dstrows)
+    {
+        __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE));
+        dst[0] = uval;
+    }
+}
+
+#elif defined INTER_NEAREST
+
+// Nearest-neighbour resize: each work-item copies one source pixel to the destination
+// pixel (dx, dy).
+//
+// srcptr/dstptr are byte pointers; the step and offset arguments are added to them
+// before the cast to PIXTYPE, i.e. they are used as byte quantities.
+// ifx/ify scale destination coordinates into source coordinates.
+__kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset,
+                       int srcrows, int srccols,
+                       __global uchar* dstptr, int dststep, int dstoffset,
+                       int dstrows, int dstcols,
+                       float ifx, float ify)
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < dstcols && dy < dstrows )
+    {
+        F s1 = dx*ifx;
+        F s2 = dy*ify;
+        // Truncate toward zero and clamp to the last source column / row.
+        int sx = min(convert_int_rtz(s1), srccols-1);
+        int sy = min(convert_int_rtz(s2), srcrows-1);
+        // The casts keep the __global address space of the byte pointers; OpenCL C does
+        // not allow a cast to change a pointer's address space.
+        __global PIXTYPE* dst = (__global PIXTYPE*)(dstptr +
+            mad24(dy, dststep, dstoffset + dx*PIXSIZE));
+        __global const PIXTYPE* src = (__global const PIXTYPE*)(srcptr +
+            mad24(sy, srcstep, srcoffset + sx*PIXSIZE));
+        dst[0] = src[0];
+    }
+}
+
+#endif
+
--- a/modules/imgproc/src/opencl/threshold.cl
+++ b/modules/imgproc/src/opencl/threshold.cl
@ -0,0 +1,152 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Ying, zhangying913@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Enable the double-precision extension when DOUBLE_SUPPORT is defined.
+#if defined (DOUBLE_SUPPORT)
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#endif
+
+// threshold type:
+// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3,
+//       THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 };
+// The kernels below switch on values 0..4; any other value copies the source unchanged.
+
+// Threshold for uchar data. Each work-item processes one group of 16 consecutive
+// elements of row gy.
+//
+// The group start is shifted back by (dst_offset & 15), and the same shift is applied to
+// src_offset. Lanes whose column falls outside [0, dst_cols) keep the value already in dst.
+// thresh_type selects the operation (0..4, see the enum above); other values copy src.
+__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst,
+                              int src_offset, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
+                              uchar thresh, uchar max_val, int thresh_type
+                              )
+{
+    const int gx = get_global_id(0);
+    const int gy = get_global_id(1);
+
+    const int shift = (dst_offset & 15);
+    src_offset -= shift;
+
+    // Column of the first lane handled by this work-item; may be negative for gx == 0.
+    const int dstart = (gx << 4) - shift;
+    if (dstart >= dst_cols || gy >= dst_rows)
+        return;
+
+    const uchar16 sdata = vload16(gx, src+src_offset+gy*src_step);
+    const uchar16 zero = 0;
+    uchar16 ddata;
+
+    if (thresh_type == 0)
+        ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0);
+    else if (thresh_type == 1)
+        ddata = ((sdata > thresh)) ? zero  : (uchar16)(max_val);
+    else if (thresh_type == 2)
+        ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata;
+    else if (thresh_type == 3)
+        ddata = ((sdata > thresh)) ? sdata : zero;
+    else if (thresh_type == 4)
+        ddata = ((sdata > thresh)) ? zero : sdata;
+    else
+        ddata = sdata;
+
+    // Per-lane destination columns.
+    const int16 dpos = (int16)(dstart) +
+                       (int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+    // Read-modify-write: lanes outside the row keep the previous destination value.
+    __global uchar16 *dptr = (__global uchar16*)(dst+dst_offset+gy*dst_step+dstart);
+    const uchar16 dVal = *dptr;
+    const int16 con = dpos >= 0 && dpos < dst_cols;
+    ddata = convert_uchar16(con != 0) ? ddata : dVal;
+    *dptr = ddata;
+}
+
+
+// Threshold for float data. Each work-item processes one group of 4 consecutive
+// elements of row gy.
+//
+// The group start is shifted back by (dst_offset & 3), and the same shift is applied to
+// src_offset. Lanes whose column falls outside [0, dst_cols) keep the value already in dst.
+// Offsets and steps are added directly to the float pointers, i.e. used as element counts.
+// thresh_type selects the operation (0..4, see the enum above); other values copy src.
+__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst,
+                              int src_offset, int src_step,
+                              int dst_offset, int dst_rows, int dst_cols, int dst_step,
+                              float thresh, float max_val, int thresh_type
+                              )
+{
+    const int gx = get_global_id(0);
+    const int gy = get_global_id(1);
+
+    const int shift = (dst_offset & 3);
+    src_offset -= shift;
+
+    // Column of the first lane handled by this work-item; may be negative for gx == 0.
+    const int dstart = (gx << 2) - shift;
+    if (dstart >= dst_cols || gy >= dst_rows)
+        return;
+
+    const float4 sdata = vload4(gx, src+src_offset+gy*src_step);
+    const float4 zero = 0;
+    float4 ddata;
+
+    if (thresh_type == 0)
+        ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f);
+    else if (thresh_type == 1)
+        ddata = sdata > thresh ? zero : (float4)max_val;
+    else if (thresh_type == 2)
+        ddata = sdata > thresh ? (float4)thresh : sdata;
+    else if (thresh_type == 3)
+        ddata = sdata > thresh ? sdata : (float4)(0.f);
+    else if (thresh_type == 4)
+        ddata = sdata > thresh ? (float4)(0.f) : sdata;
+    else
+        ddata = sdata;
+
+    // Per-lane destination columns.
+    const int4 dpos = (int4)(dstart) + (int4)(0, 1, 2, 3);
+
+    // Read-modify-write: lanes outside the row keep the previous destination value.
+    __global float4 *dptr = (__global float4*)(dst+dst_offset+gy*dst_step+dstart);
+    const float4 dVal = *dptr;
+    const int4 con = dpos >= 0 && dpos < dst_cols;
+    ddata = convert_float4(con) != (float4)(0) ? ddata : dVal;
+    *dptr = ddata;
+}
--- a/modules/imgproc/src/opencl/warpaffine.cl
+++ b/modules/imgproc/src/opencl/warpaffine.cl
@ -0,0 +1,761 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Ying, zhangying913@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+//warpAffine kernel
+//support data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4, and three interpolation methods: NN, Linear, Cubic.
+
+// F / F4 are the scalar and 4-wide floating-point types used for the transform matrix
+// and coordinate computation: double when DOUBLE_SUPPORT is defined, float otherwise.
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+typedef double F;
+typedef double4 F4;
+#define convert_F4 convert_double4
+#else
+typedef float F;
+typedef float4 F4;
+#define convert_F4 convert_float4
+#endif
+
+// INTER_BITS: number of fractional bits kept for the sub-pixel position.
+#define INTER_BITS 5
+#define INTER_TAB_SIZE (1 << INTER_BITS)
+// Parenthesized so the macro expands as a single value inside larger expressions.
+#define INTER_SCALE (1.f/INTER_TAB_SIZE)
+// AB_BITS: number of fractional bits of the fixed-point source coordinates.
+#define AB_BITS max(10, (int)INTER_BITS)
+#define AB_SCALE (1 << AB_BITS)
+// INTER_REMAP_COEF_BITS: number of fractional bits of the integer interpolation weights.
+#define INTER_REMAP_COEF_BITS 15
+#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
+
+// Fills coeffs[0..3] with the four cubic interpolation weights for the fractional
+// position x, using A = -0.75. The last weight is computed as 1 minus the other three,
+// so the four weights sum to 1 up to rounding.
+inline void interpolateCubic( float x, float* coeffs )
+{
+    const float A = -0.75f;
+    const float xp = x + 1.f;   // argument of the first weight
+    const float xm = 1.f - x;   // argument of the third weight
+
+    const float w0 = ((A*xp - 5.0f*A)*xp + 8.0f*A)*xp - 4.0f*A;
+    const float w1 = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
+    const float w2 = ((A + 2.f)*xm - (A + 3.f))*xm*xm + 1.f;
+
+    coeffs[0] = w0;
+    coeffs[1] = w1;
+    coeffs[2] = w2;
+    coeffs[3] = 1.f - w0 - w1 - w2;
+}
+
+
+/**********************************************8UC1*********************************************
+***********************************************************************************************/
+// Nearest-neighbour warpAffine for uchar data. Each work-item handles 4 consecutive
+// destination pixels of row dy, starting at column (gx << 2) - (dst_offset & 3).
+//
+// M holds six coefficients: M[0..2] produce the source x coordinate and M[3..5] the
+// source y coordinate. Source pixels outside the image read as 0; destination lanes
+// outside [0, dst_cols) keep their previous value.
+__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
+                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    const int gx = get_global_id(0);
+    const int dy = get_global_id(1);
+
+    if (gx >= threadCols || dy >= dst_rows)
+        return;
+
+    const int dx = (gx<<2) - (dst_offset&3);
+    const int4 cols = (int4)(dx, dx+1, dx+2, dx+3);
+    const int round_delta = (AB_SCALE>>1);
+
+    // Column-dependent part of the transform, in AB_BITS fixed point.
+    const F4 colsFixed = convert_F4(cols << AB_BITS);
+    int4 X = convert_int4(rint(M[0] * colsFixed));
+    int4 Y = convert_int4(rint(M[3] * colsFixed));
+
+    // Row-dependent part, shared by the four lanes.
+    const int rowX = rint((M[1]*dy + M[2]) * AB_SCALE);
+    const int rowY = rint((M[4]*dy + M[5]) * AB_SCALE);
+    X += rowX + round_delta;
+    Y += rowY + round_delta;
+
+    const int4 sx = convert_int4(convert_short4(X >> AB_BITS));
+    const int4 sy = convert_int4(convert_short4(Y >> AB_BITS));
+
+    __global uchar4 * out = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
+    const uchar4 previous = *out;
+
+    const int4 dstInside = cols >= 0 && cols < dst_cols && dy >= 0 && dy < dst_rows;
+    const int4 srcInside = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
+    const int4 srcIdx = src_offset + sy * srcStep + sx;
+
+    // Gather lane by lane; out-of-image source positions contribute 0.
+    uchar4 fetched;
+    fetched.s0 = srcInside.s0 ? src[srcIdx.s0] : 0;
+    fetched.s1 = srcInside.s1 ? src[srcIdx.s1] : 0;
+    fetched.s2 = srcInside.s2 ? src[srcIdx.s2] : 0;
+    fetched.s3 = srcInside.s3 ? src[srcIdx.s3] : 0;
+
+    *out = convert_uchar4(dstInside) != (uchar4)(0,0,0,0) ? fetched : previous;
+}
+
+// Bilinear warpAffine for uchar data. Each work-item handles 4 consecutive destination
+// pixels of row dy, starting at column (dx << 2) - (dst_offset & 3).
+//
+// M holds six coefficients: M[0..2] produce the source x coordinate and M[3..5] the
+// source y coordinate. Source pixels outside the image read as 0; destination lanes
+// outside [0, dst_cols) keep their previous value.
+__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows,
+                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // First destination column of this work-item's group of four.
+        dx = (dx<<2) - (dst_offset&3);
+
+        int round_delta = ((AB_SCALE >> INTER_BITS) >> 1);
+
+        int4 X, Y;
+        short4  ax, ay;
+        int4 sx, sy;
+        int4 DX = (int4)(dx, dx+1, dx+2, dx+3);
+        DX = (DX << AB_BITS);
+        F4 M0DX, M3DX;
+        // Column-dependent part of the transform, in AB_BITS fixed point.
+        M0DX = M[0] * convert_F4(DX);
+        M3DX = M[3] * convert_F4(DX);
+        X = convert_int4(rint(M0DX));
+        Y = convert_int4(rint(M3DX));
+
+        // Row-dependent part, shared by the four lanes.
+        int tmp1, tmp2;
+        tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE);
+        tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE);
+
+        X += tmp1 + round_delta;
+        Y += tmp2 + round_delta;
+
+        // Reduce to INTER_BITS fractional bits.
+        X = X >> (AB_BITS - INTER_BITS);
+        Y = Y >> (AB_BITS - INTER_BITS);
+
+        // Integer source position (sx, sy) and fractional index (ax, ay).
+        sx = convert_int4(convert_short4(X >> INTER_BITS));
+        sy = convert_int4(convert_short4(Y >> INTER_BITS));
+        ax = convert_short4(X & (INTER_TAB_SIZE-1));
+        ay = convert_short4(Y & (INTER_TAB_SIZE-1));
+
+        uchar4 v0, v1, v2,v3;
+        int4 scon0, scon1, scon2, scon3;
+        int4 spos0, spos1, spos2, spos3;
+
+        // Validity masks and linear indices of the four neighbours:
+        // 0 = (sx, sy), 1 = (sx+1, sy), 2 = (sx, sy+1), 3 = (sx+1, sy+1).
+        scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows);
+        scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows);
+        scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows);
+        scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows);
+        spos0 = src_offset + sy * srcStep + sx;
+        spos1 = src_offset + sy * srcStep + sx + 1;
+        spos2 = src_offset + (sy+1) * srcStep + sx;
+        spos3 = src_offset + (sy+1) * srcStep + sx + 1;
+
+        // Gather lane by lane; out-of-image neighbours contribute 0.
+        v0.s0 = scon0.s0 ? src[spos0.s0] : 0;
+        v1.s0 = scon1.s0 ? src[spos1.s0] : 0;
+        v2.s0 = scon2.s0 ? src[spos2.s0] : 0;
+        v3.s0 = scon3.s0 ? src[spos3.s0] : 0;
+
+        v0.s1 = scon0.s1 ? src[spos0.s1] : 0;
+        v1.s1 = scon1.s1 ? src[spos1.s1] : 0;
+        v2.s1 = scon2.s1 ? src[spos2.s1] : 0;
+        v3.s1 = scon3.s1 ? src[spos3.s1] : 0;
+
+        v0.s2 = scon0.s2 ? src[spos0.s2] : 0;
+        v1.s2 = scon1.s2 ? src[spos1.s2] : 0;
+        v2.s2 = scon2.s2 ? src[spos2.s2] : 0;
+        v3.s2 = scon3.s2 ? src[spos3.s2] : 0;
+
+        v0.s3 = scon0.s3 ? src[spos0.s3] : 0;
+        v1.s3 = scon1.s3 ? src[spos1.s3] : 0;
+        v2.s3 = scon2.s3 ? src[spos2.s3] : 0;
+        v3.s3 = scon3.s3 ? src[spos3.s3] : 0;
+
+        // Bilinear weights scaled by INTER_REMAP_COEF_SCALE and saturated to short.
+        short4 itab0, itab1, itab2, itab3;
+        float4 taby, tabx;
+        taby = INTER_SCALE * convert_float4(ay);
+        tabx = INTER_SCALE * convert_float4(ax);
+
+        itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
+        itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE ));
+        itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE ));
+        itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE ));
+
+
+        // Weighted sum, then round and drop the INTER_REMAP_COEF_BITS fractional bits.
+        int4 val;
+        uchar4 tval;
+        val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1)
+              + convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3);
+        tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+
+        // Read-modify-write: lanes outside the destination row keep their previous value.
+        __global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
+        uchar4 dval = *d;
+        DX = (int4)(dx, dx+1, dx+2, dx+3);
+        int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows;
+        dval = convert_uchar4(dcon != 0) ? tval : dval;
+        *d = dval;
+    }
+}
+
+// Bicubic warpAffine for uchar data; one destination pixel (dx, dy) per work-item.
+//
+// M holds six coefficients: M[0..2] produce the source x coordinate and M[3..5] the
+// source y coordinate. A 4x4 source neighbourhood is gathered (out-of-image taps read
+// as 0) and combined with integer weights scaled by INTER_REMAP_COEF_SCALE.
+__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
+                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    const int dx = get_global_id(0);
+    const int dy = get_global_id(1);
+
+    if (dx >= threadCols || dy >= dst_rows)
+        return;
+
+    const int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
+
+    // Source position in AB_BITS fixed point. The accumulation statements are kept in
+    // this exact form because the right-hand side mixes integer and floating-point terms.
+    int X0 = rint(M[0] * dx * AB_SCALE);
+    int Y0 = rint(M[3] * dx * AB_SCALE);
+    X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+    Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+    const int X = X0 >> (AB_BITS - INTER_BITS);
+    const int Y = Y0 >> (AB_BITS - INTER_BITS);
+
+    // Top-left tap of the 4x4 window and the fractional indices.
+    const short sx = (short)(X >> INTER_BITS) - 1;
+    const short sy = (short)(Y >> INTER_BITS) - 1;
+    const short ay = (short)(Y & (INTER_TAB_SIZE-1));
+    const short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+    uchar taps[16];
+    int k;
+
+#pragma unroll 16
+    for (k = 0; k < 16; k++)
+    {
+        const int row = sy + (k >> 2);
+        const int col = sx + (k & 3);
+        const int inside = (col >= 0 && col < src_cols && row >= 0 && row < src_rows);
+        taps[k] = inside ? src[src_offset + row * srcStep + col] : 0;
+    }
+
+    short itab[16];
+    float tab1y[4], tab1x[4];
+
+    const float ayy = 1.f/INTER_TAB_SIZE * ay;
+    const float axx = 1.f/INTER_TAB_SIZE * ax;
+    interpolateCubic(ayy, tab1y);
+    interpolateCubic(axx, tab1x);
+    int isum = 0;
+
+#pragma unroll 16
+    for (k = 0; k < 16; k++)
+    {
+        const F weight = tab1y[(k>>2)] * tab1x[(k&3)];
+        itab[k] = convert_short_sat( rint( weight * INTER_REMAP_COEF_SCALE ) );
+        isum += itab[k];
+    }
+
+    // If rounding made the weights miss INTER_REMAP_COEF_SCALE, fold the difference into
+    // the largest (deficit) or smallest (excess) weight among rows/columns 2..3.
+    if (isum != INTER_REMAP_COEF_SCALE)
+    {
+        const int diff = isum - INTER_REMAP_COEF_SCALE;
+        int hi = (2<<2)+2, lo = (2<<2)+2;
+        int k1, k2;
+        for (k1 = 2; k1 < 4; k1++)
+            for (k2 = 2; k2 < 4; k2++)
+            {
+                const int idx = (k1<<2)+k2;
+                if (itab[idx] < itab[lo])
+                    lo = idx;
+                else if (itab[idx] > itab[hi])
+                    hi = idx;
+            }
+        if (diff < 0)
+            itab[hi] = (short)(itab[hi]-diff);
+        else
+            itab[lo] = (short)(itab[lo]-diff);
+    }
+
+    if (dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+    {
+        int sum = 0;
+        for (k = 0; k < 16; k++)
+            sum += taps[k] * itab[k];
+        // Round and drop the INTER_REMAP_COEF_BITS fractional bits.
+        dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+    }
+}
+
+/**********************************************8UC4*********************************************
+***********************************************************************************************/
+
+// Nearest-neighbour warpAffine for uchar4 data; one destination pixel (dx, dy) per
+// work-item.
+//
+// M holds six coefficients: M[0..2] produce the source x coordinate and M[3..5] the
+// source y coordinate. Offsets and steps are shifted right by 2 before indexing the
+// uchar4 pointers. Source positions outside the image produce (uchar4)0.
+__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
+                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    const int dx = get_global_id(0);
+    const int dy = get_global_id(1);
+
+    if (dx >= threadCols || dy >= dst_rows)
+        return;
+
+    const int round_delta = (AB_SCALE >> 1);
+
+    // Source position in AB_BITS fixed point. The accumulation statements are kept in
+    // this exact form because the right-hand side mixes integer and floating-point terms.
+    int X0 = rint(M[0] * dx * AB_SCALE);
+    int Y0 = rint(M[3] * dx * AB_SCALE);
+    X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+    Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+
+    const int sx0 = (short)(X0 >> AB_BITS);
+    const int sy0 = (short)(Y0 >> AB_BITS);
+
+    if (dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+    {
+        uchar4 pixel = (uchar4)0;
+        if (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows)
+            pixel = src[(src_offset>>2)+sy0*(srcStep>>2)+sx0];
+        dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = pixel;
+    }
+}
+
+// Bilinear warpAffine for uchar4 data; one destination pixel (dx, dy) per work-item.
+//
+// M holds six coefficients: M[0..2] produce the source x coordinate and M[3..5] the
+// source y coordinate. Offsets and steps are shifted right by 2 before indexing the
+// uchar4 pointers. Neighbours outside the source image contribute 0.
+__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
+                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    const int dx = get_global_id(0);
+    const int dy = get_global_id(1);
+
+    if (dx >= threadCols || dy >= dst_rows)
+        return;
+
+    const int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
+
+    src_offset = (src_offset>>2);
+    srcStep = (srcStep>>2);
+
+    // Source position in AB_BITS fixed point. The accumulation statements are kept in
+    // this exact form because the right-hand side mixes integer and floating-point terms.
+    const int dxFixed = (dx << AB_BITS);
+    int X0 = rint(M[0] * dxFixed);
+    int Y0 = rint(M[3] * dxFixed);
+    X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+    Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+    X0 = X0 >> (AB_BITS - INTER_BITS);
+    Y0 = Y0 >> (AB_BITS - INTER_BITS);
+
+    // Integer source position and fractional indices.
+    const short sx0 = (short)(X0 >> INTER_BITS);
+    const short sy0 = (short)(Y0 >> INTER_BITS);
+    const short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
+    const short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
+
+    // Per-axis validity of the two columns and the two rows of the 2x2 window.
+    const int colOk0 = (sx0 >= 0 && sx0 < src_cols);
+    const int colOk1 = (sx0+1 >= 0 && sx0+1 < src_cols);
+    const int rowOk0 = (sy0 >= 0 && sy0 < src_rows);
+    const int rowOk1 = (sy0+1 >= 0 && sy0+1 < src_rows);
+
+    int4 v0 = (int4)(0), v1 = (int4)(0), v2 = (int4)(0), v3 = (int4)(0);
+    if (colOk0 && rowOk0)
+        v0 = convert_int4(src[src_offset+sy0 * srcStep + sx0]);
+    if (colOk1 && rowOk0)
+        v1 = convert_int4(src[src_offset+sy0 * srcStep + sx0+1]);
+    if (colOk0 && rowOk1)
+        v2 = convert_int4(src[src_offset+(sy0+1) * srcStep + sx0]);
+    if (colOk1 && rowOk1)
+        v3 = convert_int4(src[src_offset+(sy0+1) * srcStep + sx0+1]);
+
+    const float taby = 1.f/INTER_TAB_SIZE*ay0;
+    const float tabx = 1.f/INTER_TAB_SIZE*ax0;
+
+    // Bilinear weights scaled by INTER_REMAP_COEF_SCALE and saturated to short.
+    const int itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
+    const int itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
+    const int itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
+    const int itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
+
+    const int4 val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
+
+    // Round and drop the INTER_REMAP_COEF_BITS fractional bits.
+    if (dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] =  convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+}
+
+// Bicubic warpAffine for uchar4 data; one destination pixel (dx, dy) per work-item.
+//
+// M holds six coefficients: M[0..2] produce the source x coordinate and M[3..5] the
+// source y coordinate. Offsets and steps are shifted right by 2 before indexing the
+// uchar4 pointers. A 4x4 source neighbourhood is gathered (out-of-image taps read as 0)
+// and combined with integer weights scaled by INTER_REMAP_COEF_SCALE.
+__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows,
+                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        int round_delta = ((AB_SCALE>>INTER_BITS)>>1);
+
+        src_offset = (src_offset>>2);
+        srcStep = (srcStep>>2);
+        dst_offset = (dst_offset>>2);
+        dstStep = (dstStep>>2);
+
+        // Source position in AB_BITS fixed point, then reduced to INTER_BITS fractional bits.
+        int tmp = (dx << AB_BITS);
+        int X0 = rint(M[0] * tmp);
+        int Y0 = rint(M[3] * tmp);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+        X0 = X0 >> (AB_BITS - INTER_BITS);
+        Y0 = Y0 >> (AB_BITS - INTER_BITS);
+
+        // Top-left tap of the 4x4 window (sx, sy) and the fractional indices (ax, ay).
+        int sx = (short)(X0 >> INTER_BITS) - 1;
+        int sy = (short)(Y0 >> INTER_BITS) - 1;
+        int ay = (short)(Y0 & (INTER_TAB_SIZE-1));
+        int ax = (short)(X0 & (INTER_TAB_SIZE-1));
+
+        // Gather the 4x4 neighbourhood in row-major order.
+        uchar4 v[16];
+        int i,j;
+#pragma unroll 4
+        for(i=0; i<4; i++)
+            for(j=0; j<4; j++)
+            {
+                v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)])  : (uchar4)0;
+            }
+        int itab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = INTER_SCALE * ay;
+        axx = INTER_SCALE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+        int isum = 0;
+
+        // Separable weights: itab[i] = round(tab1y[row] * tab1x[col] * INTER_REMAP_COEF_SCALE).
+#pragma unroll 16
+        for( i=0; i<16; i++ )
+        {
+            // Note: this float 'tmp' shadows the int 'tmp' declared above.
+            float tmp;
+            tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
+            itab[i] = rint(tmp);
+            isum += itab[i];
+        }
+
+        // If rounding made the weights miss INTER_REMAP_COEF_SCALE, fold the difference into
+        // the largest (deficit) or smallest (excess) weight among rows/columns 2..3.
+        if( isum != INTER_REMAP_COEF_SCALE )
+        {
+            int k1, k2;
+            int diff = isum - INTER_REMAP_COEF_SCALE;
+            int Mk1=2, Mk2=2, mk1=2, mk2=2;
+
+            for( k1 = 2; k1 < 4; k1++ )
+                for( k2 = 2; k2 < 4; k2++ )
+                {
+
+                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
+                        mk1 = k1, mk2 = k2;
+                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
+                        Mk1 = k1, Mk2 = k2;
+                }
+
+            // The adjusted weight passes through a (short) cast although itab is int here.
+            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
+        }
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            int4 sum=0;
+            for ( i =0; i<16; i++ )
+            {
+                sum += convert_int4(v[i]) * itab[i];
+            }
+            // Round and drop the INTER_REMAP_COEF_BITS fractional bits.
+            dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+        }
+    }
+}
+
+
+/**********************************************32FC1********************************************
+***********************************************************************************************/
+
+// Nearest-neighbour affine warp, single-channel 32-bit float.
+// Each work-item computes one destination pixel (dx, dy): the source position is
+// evaluated from M[0..5] in fixed point (AB_BITS fractional bits), rounded to the
+// nearest pixel, and the source value is copied. Positions outside
+// [0, src_cols) x [0, src_rows) produce 0.
+// Both offsets are shifted right by 2 before indexing while the steps are used
+// unchanged as row strides.
+// NOTE(review): that suggests offsets are passed in bytes and steps in float
+// elements - confirm against the host-side launcher.
+__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
+                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // Rounding bias added before the >> AB_BITS shift below.
+        int round_delta = AB_SCALE/2;
+
+        int X0 = rint(M[0] * dx * AB_SCALE);
+        int Y0 = rint(M[3] * dx * AB_SCALE);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+
+        // Keep the full int coordinate. Narrowing to short wraps positions
+        // beyond +/-32767 back into range, so a sample far outside the source
+        // could pass the bounds test below and read an unrelated pixel.
+        int sx0 = X0 >> AB_BITS;
+        int sy0 = Y0 >> AB_BITS;
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0;
+    }
+}
+
+// Bilinear affine warp, single-channel 32-bit float.
+// Each work-item computes one destination pixel (dx, dy). The source position is
+// evaluated from M[0..5] in fixed point, reduced to INTER_BITS fractional bits,
+// and the four neighbouring source pixels are blended with weights derived from
+// the fractional part. Neighbours outside the source image contribute 0.
+// NOTE(review): offsets are shifted right by 2 while steps are used unchanged,
+// which suggests byte offsets and element strides - confirm with the host code.
+__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
+                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // Rounding bias for the reduction to INTER_BITS fractional bits.
+        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
+
+        src_offset = (src_offset>>2);
+
+        int X0 = rint(M[0] * dx * AB_SCALE);
+        int Y0 = rint(M[3] * dx * AB_SCALE);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+        X0 = X0 >> (AB_BITS - INTER_BITS);
+        Y0 = Y0 >> (AB_BITS - INTER_BITS);
+
+        // Integer pixel position kept as int: narrowing to short wraps positions
+        // beyond +/-32767 back into range and lets them pass the bounds tests.
+        int sx0 = X0 >> INTER_BITS;
+        int sy0 = Y0 >> INTER_BITS;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
+        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
+
+        float v0, v1, v2, v3;
+
+        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0;
+        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0;
+        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0;
+        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0;
+
+        float tab[4];
+        float taby[2], tabx[2];
+        // Single-precision literals: this kernel is float-only and must not
+        // require double support. k/INTER_TAB_SIZE is exact in float, so the
+        // weights are unchanged.
+        taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay0;
+        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
+        tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax0;
+        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
+
+        tab[0] = taby[0] * tabx[0];
+        tab[1] = taby[0] * tabx[1];
+        tab[2] = taby[1] * tabx[0];
+        tab[3] = taby[1] * tabx[1];
+
+        float sum = 0;
+        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
+    }
+}
+
+// Bicubic affine warp, single-channel 32-bit float.
+// Each work-item computes one destination pixel (dx, dy) as the weighted sum of
+// a 4x4 source neighbourhood whose top-left corner is one pixel up and left of
+// the integer source position. Per-axis weights come from interpolateCubic();
+// neighbours outside the source image contribute 0.
+// NOTE(review): offsets are shifted right by 2 while steps are used unchanged,
+// which suggests byte offsets and element strides - confirm with the host code.
+__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
+                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // Rounding bias for the reduction to INTER_BITS fractional bits.
+        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
+
+        src_offset = (src_offset>>2);
+        dst_offset = (dst_offset>>2);
+
+        int X0 = rint(M[0] * dx * AB_SCALE);
+        int Y0 = rint(M[3] * dx * AB_SCALE);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+        X0 = X0 >> (AB_BITS - INTER_BITS);
+        Y0 = Y0 >> (AB_BITS - INTER_BITS);
+
+        // Top-left corner of the 4x4 window, kept as int: narrowing to short
+        // wraps positions beyond +/-32767 back into range and lets them pass
+        // the bounds tests below.
+        int sx = (X0 >> INTER_BITS) - 1;
+        int sy = (Y0 >> INTER_BITS) - 1;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
+        short ax = (short)(X0 & (INTER_TAB_SIZE-1));
+
+        float v[16];
+        int i;
+
+        // Row-major gather of the window: column = i & 3, row = i >> 2.
+        for(i=0; i<16;  i++)
+            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0;
+
+        float tab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = 1.f/INTER_TAB_SIZE * ay;
+        axx = 1.f/INTER_TAB_SIZE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+
+        // Separable 2-D weights: product of the row and column weights.
+#pragma unroll 4
+        for( i=0; i<16; i++ )
+        {
+            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
+        }
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            float sum = 0;
+#pragma unroll 4
+            for ( i =0; i<16; i++ )
+            {
+                sum += v[i] * tab[i];
+            }
+            dst[dst_offset+dy*dstStep+dx] = sum;
+
+        }
+    }
+}
+
+
+/**********************************************32FC4********************************************
+***********************************************************************************************/
+
+// Nearest-neighbour affine warp, four-channel 32-bit float (float4 pixels).
+// Each work-item computes one destination pixel (dx, dy): the source position is
+// evaluated from M[0..5] in fixed point (AB_BITS fractional bits), rounded to the
+// nearest pixel, and the source float4 is copied. Positions outside
+// [0, src_cols) x [0, src_rows) produce a zero vector.
+// Offsets are shifted right by 4 and steps by 2 to index in float4 units.
+// NOTE(review): that suggests byte offsets and strides counted in floats -
+// confirm against the host-side launcher.
+__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
+                                 int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                 int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // Rounding bias added before the >> AB_BITS shift below.
+        int round_delta = AB_SCALE/2;
+
+        int X0 = rint(M[0] * dx * AB_SCALE);
+        int Y0 = rint(M[3] * dx * AB_SCALE);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+
+        // Keep the full int coordinate. Narrowing to short wraps positions
+        // beyond +/-32767 back into range, so a sample far outside the source
+        // could pass the bounds test below and read an unrelated pixel.
+        int sx0 = X0 >> AB_BITS;
+        int sy0 = Y0 >> AB_BITS;
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0;
+    }
+}
+
+// Bilinear affine warp, four-channel 32-bit float (float4 pixels).
+// Each work-item computes one destination pixel (dx, dy). The source position is
+// evaluated from M[0..5] in fixed point, reduced to INTER_BITS fractional bits,
+// and the four neighbouring float4 pixels are blended with scalar weights
+// derived from the fractional part. Neighbours outside the source image
+// contribute a zero vector.
+// Offsets are shifted right by 4 and steps by 2 to index in float4 units.
+// NOTE(review): that suggests byte offsets and strides counted in floats -
+// confirm against the host-side launcher.
+__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
+                                     int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                     int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // Rounding bias for the reduction to INTER_BITS fractional bits.
+        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
+
+        src_offset = (src_offset>>4);
+        dst_offset = (dst_offset>>4);
+        srcStep = (srcStep>>2);
+        dstStep = (dstStep>>2);
+
+        int X0 = rint(M[0] * dx * AB_SCALE);
+        int Y0 = rint(M[3] * dx * AB_SCALE);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+        X0 = X0 >> (AB_BITS - INTER_BITS);
+        Y0 = Y0 >> (AB_BITS - INTER_BITS);
+
+        // Integer pixel position kept as int: narrowing to short wraps positions
+        // beyond +/-32767 back into range and lets them pass the bounds tests.
+        int sx0 = X0 >> INTER_BITS;
+        int sy0 = Y0 >> INTER_BITS;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ax0 = (short)(X0 & (INTER_TAB_SIZE-1));
+        short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1));
+
+        float4 v0, v1, v2, v3;
+
+        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
+        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
+        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
+        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
+
+        float tab[4];
+        float taby[2], tabx[2];
+        // Single-precision literals: this kernel is float-only and must not
+        // require double support. k/INTER_TAB_SIZE is exact in float, so the
+        // weights are unchanged.
+        taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay0;
+        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
+        tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax0;
+        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
+
+        tab[0] = taby[0] * tabx[0];
+        tab[1] = taby[0] * tabx[1];
+        tab[2] = taby[1] * tabx[0];
+        tab[3] = taby[1] * tabx[1];
+
+        float4 sum = 0;
+        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[dst_offset+dy*dstStep+dx] = sum;
+    }
+}
+
+// Bicubic affine warp, four-channel 32-bit float (float4 pixels).
+// Each work-item computes one destination pixel (dx, dy) as the weighted sum of
+// a 4x4 source neighbourhood whose top-left corner is one pixel up and left of
+// the integer source position. Per-axis weights come from interpolateCubic();
+// neighbours outside the source image contribute a zero vector.
+// Offsets are shifted right by 4 and steps by 2 to index in float4 units.
+// NOTE(review): that suggests byte offsets and strides counted in floats -
+// confirm against the host-side launcher.
+__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
+                                    int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                    int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // Rounding bias for the reduction to INTER_BITS fractional bits.
+        int round_delta = AB_SCALE/INTER_TAB_SIZE/2;
+
+        src_offset = (src_offset>>4);
+        dst_offset = (dst_offset>>4);
+        srcStep = (srcStep>>2);
+        dstStep = (dstStep>>2);
+
+        int X0 = rint(M[0] * dx * AB_SCALE);
+        int Y0 = rint(M[3] * dx * AB_SCALE);
+        X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta;
+        Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta;
+        X0 = X0 >> (AB_BITS - INTER_BITS);
+        Y0 = Y0 >> (AB_BITS - INTER_BITS);
+
+        // Top-left corner of the 4x4 window, kept as int: narrowing to short
+        // wraps positions beyond +/-32767 back into range and lets them pass
+        // the bounds tests below.
+        int sx = (X0 >> INTER_BITS) - 1;
+        int sy = (Y0 >> INTER_BITS) - 1;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ay = (short)(Y0 & (INTER_TAB_SIZE-1));
+        short ax = (short)(X0 & (INTER_TAB_SIZE-1));
+
+        float4 v[16];
+        int i;
+
+        // Row-major gather of the window: column = i & 3, row = i >> 2.
+        for(i=0; i<16;  i++)
+            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
+
+        float tab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = 1.f/INTER_TAB_SIZE * ay;
+        axx = 1.f/INTER_TAB_SIZE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+
+        // Separable 2-D weights: product of the row and column weights.
+#pragma unroll 4
+        for( i=0; i<16; i++ )
+        {
+            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
+        }
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            float4 sum = 0;
+#pragma unroll 4
+            for ( i =0; i<16; i++ )
+            {
+                sum += v[i] * tab[i];
+            }
+            dst[dst_offset+dy*dstStep+dx] = sum;
+
+        }
+    }
+}
--- a/modules/imgproc/src/opencl/warpperspective.cl
+++ b/modules/imgproc/src/opencl/warpperspective.cl
@ -0,0 +1,688 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Zhang Ying, zhangying913@gmail.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+
+//warpPerspective kernels
+//Supported data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4; three interpolation methods: NN, Linear, Cubic.
+
+// Select the floating-point type used for the transform matrix and the
+// coordinate arithmetic: double when the program is built with DOUBLE_SUPPORT
+// defined, float otherwise. F is the scalar type, F4 the matching 4-vector and
+// convert_F4 the matching vector conversion.
+#if defined (DOUBLE_SUPPORT)
+// Enable whichever fp64 extension the compiler advertises (Khronos or AMD).
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
+typedef double F;
+typedef double4 F4;
+#define convert_F4 convert_double4
+#else
+typedef float F;
+typedef float4 F4;
+#define convert_F4 convert_float4
+#endif
+
+
+// Fixed-point parameters used by the kernels in this file.
+// INTER_BITS: number of fractional (sub-pixel) bits of a source coordinate.
+#define INTER_BITS 5
+// Number of sub-pixel positions per pixel (2^INTER_BITS).
+#define INTER_TAB_SIZE (1 << INTER_BITS)
+// Size of one sub-pixel step. Parenthesised so the macro expands to a single
+// value in any expression (e.g. x / INTER_SCALE); uses of the form
+// INTER_SCALE * a evaluate exactly as before.
+#define INTER_SCALE (1.f/INTER_TAB_SIZE)
+#define AB_BITS max(10, (int)INTER_BITS)
+#define AB_SCALE (1 << AB_BITS)
+// Integer interpolation weights are scaled by 2^INTER_REMAP_COEF_BITS.
+#define INTER_REMAP_COEF_BITS 15
+#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
+
+// Computes the four cubic-convolution weights (kernel parameter A = -0.75) for
+// a fractional offset x and stores them in coeffs[0..3]. The last weight is
+// derived as 1 minus the other three, so the stored weights sum to 1 up to
+// rounding.
+inline void interpolateCubic( float x, float* coeffs )
+{
+    const float A = -0.75f;
+    const float xp1 = x + 1.f;   // offset measured from the first tap
+    const float xm  = 1.f - x;   // offset measured from the third tap
+
+    const float w0 = ((A*xp1 - 5.0f*A)*xp1 + 8.0f*A)*xp1 - 4.0f*A;
+    const float w1 = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f;
+    const float w2 = ((A + 2.f)*xm - (A + 3.f))*xm*xm + 1.f;
+
+    coeffs[0] = w0;
+    coeffs[1] = w1;
+    coeffs[2] = w2;
+    coeffs[3] = 1.f - w0 - w1 - w2;
+}
+
+
+/**********************************************8UC1*********************************************
+***********************************************************************************************/
+// Nearest-neighbour perspective warp, single-channel 8-bit.
+// Each work-item handles four horizontally adjacent destination bytes starting
+// at column 4*id - (dst_offset & 3). It reads the existing uchar4 from dst,
+// replaces the lanes whose column lies in [0, dst_cols) and writes the vector
+// back, so lanes outside the destination keep their previous value.
+// Source positions are (X0/W, Y0/W) computed from M[0..8] and rounded to the
+// nearest pixel; positions outside the source image produce 0.
+// NOTE(review): subtracting (dst_offset & 3) presumably aligns the uchar4
+// access to a 4-byte boundary - confirm with the host-side launcher.
+__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows,
+                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                      int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        // First destination column of this work-item's group of four.
+        dx = (dx<<2) - (dst_offset&3);
+
+        F4 DX = (F4)(dx, dx+1, dx+2, dx+3);
+        F4 X0 = M[0]*DX + M[1]*dy + M[2];
+        F4 Y0 = M[3]*DX + M[4]*dy + M[5];
+        F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0;
+        // Reciprocal of the projective denominator; lanes with W == 0 get 0.
+        W = (W!=zero) ? one/W : zero;
+        // Saturating conversion straight to int. A non-saturating conversion to
+        // short is implementation-defined for out-of-range values and can make
+        // a far-outside position look like a valid source pixel.
+        int4 sx = convert_int4_sat(rint(X0*W));
+        int4 sy = convert_int4_sat(rint(Y0*W));
+
+        int4 DXD = (int4)(dx, dx+1, dx+2, dx+3);
+        __global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx);
+        uchar4 dval = *d;
+        // Per-lane masks: destination column valid / source position valid.
+        int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows;
+        int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows;
+        int4 spos = src_offset + sy * srcStep + sx;
+        uchar4 sval;
+        sval.s0 = scon.s0 ? src[spos.s0] : 0;
+        sval.s1 = scon.s1 ? src[spos.s1] : 0;
+        sval.s2 = scon.s2 ? src[spos.s2] : 0;
+        sval.s3 = scon.s3 ? src[spos.s3] : 0;
+        // Lanes outside the destination keep the value read from dst.
+        dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval;
+        *d = dval;
+    }
+}
+
+// Bilinear perspective warp, single-channel 8-bit.
+// Each work-item computes one destination pixel (dx, dy). The source position
+// (X0/W, Y0/W) from M[0..8] is scaled by INTER_TAB_SIZE and rounded, giving an
+// integer pixel plus an INTER_BITS-bit sub-pixel offset. The four neighbouring
+// pixels are blended with integer weights scaled by INTER_REMAP_COEF_SCALE and
+// the result is rounded and saturated to uchar. Neighbours outside the source
+// image contribute 0.
+__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
+        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        // Saturate: near W == 0 the quotient can exceed the int range, and an
+        // unsaturated float-to-int conversion is not well defined there.
+        int X = convert_int_sat(rint(X0*W));
+        int Y = convert_int_sat(rint(Y0*W));
+
+        // Integer pixel position kept as int: narrowing to short wraps positions
+        // beyond +/-32767 back into range and lets them pass the bounds tests.
+        int sx = X >> INTER_BITS;
+        int sy = Y >> INTER_BITS;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        int ay = Y & (INTER_TAB_SIZE-1);
+        int ax = X & (INTER_TAB_SIZE-1);
+
+        uchar v[4];
+        int i;
+        // 2x2 neighbourhood, row-major: column = i & 1, row = i >> 1.
+#pragma unroll 4
+        for(i=0; i<4;  i++)
+            v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0;
+
+        short itab[4];
+        float tab1y[2], tab1x[2];
+        // Single-precision literals; k/INTER_TAB_SIZE is exact in float, so the
+        // weights are the same as with a double-precision 1.0.
+        tab1y[0] = 1.f - 1.f/INTER_TAB_SIZE*ay;
+        tab1y[1] = 1.f/INTER_TAB_SIZE*ay;
+        tab1x[0] = 1.f - 1.f/INTER_TAB_SIZE*ax;
+        tab1x[1] = 1.f/INTER_TAB_SIZE*ax;
+
+#pragma unroll 4
+        for(i=0; i<4;  i++)
+        {
+            // Named w so it no longer shadows the pixel array v[].
+            float w = tab1y[(i>>1)] * tab1x[(i&1)];
+            itab[i] = convert_short_sat(rint( w * INTER_REMAP_COEF_SCALE ));
+        }
+        if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            int sum = 0;
+            for ( i =0; i<4; i++ )
+            {
+                sum += v[i] * itab[i] ;
+            }
+            // Add half of the weight scale, then shift: round-to-nearest division.
+            dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+        }
+    }
+}
+
+// Bicubic perspective warp, single-channel 8-bit.
+// Each work-item computes one destination pixel (dx, dy) from a 4x4 source
+// neighbourhood whose top-left corner is one pixel up and left of the integer
+// source position. Per-axis weights come from interpolateCubic(), are combined
+// into 16 integer weights scaled by INTER_REMAP_COEF_SCALE, and are corrected
+// so that they sum exactly to that scale. The weighted sum is rounded and
+// saturated to uchar. Neighbours outside the source image contribute 0.
+__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows,
+        int dst_cols, int dst_rows, int srcStep, int dstStep,
+        int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        // Saturate: near W == 0 the quotient can exceed the int range, and an
+        // unsaturated float-to-int conversion is not well defined there.
+        int X = convert_int_sat(rint(X0*W));
+        int Y = convert_int_sat(rint(Y0*W));
+
+        // Top-left corner of the 4x4 window, kept as int: narrowing to short
+        // wraps positions beyond +/-32767 back into range and lets them pass
+        // the bounds tests below.
+        int sx = (X >> INTER_BITS) - 1;
+        int sy = (Y >> INTER_BITS) - 1;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+        uchar v[16];
+        int i, j;
+
+        // Row-major gather of the window (i = row, j = column).
+#pragma unroll 4
+        for(i=0; i<4;  i++)
+            for(j=0; j<4;  j++)
+            {
+                v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0;
+            }
+
+        short itab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = 1.f/INTER_TAB_SIZE * ay;
+        axx = 1.f/INTER_TAB_SIZE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+
+        int isum = 0;
+#pragma unroll 16
+        for( i=0; i<16; i++ )
+        {
+            // Named w so it no longer shadows the pixel array v[].
+            F w = tab1y[(i>>2)] * tab1x[(i&3)];
+            isum += itab[i] = convert_short_sat( rint( w * INTER_REMAP_COEF_SCALE ) );
+        }
+        // Rounding can make the integer weights sum to slightly more or less
+        // than the scale. Push the difference into the largest (if the sum is
+        // too small) or smallest (if too large) of the four weights at
+        // rows/columns 2..3 so the total is exact.
+        if( isum != INTER_REMAP_COEF_SCALE )
+        {
+            int k1, k2;
+            int diff = isum - INTER_REMAP_COEF_SCALE;
+            int Mk1=2, Mk2=2, mk1=2, mk2=2;
+            for( k1 = 2; k1 < 4; k1++ )
+                for( k2 = 2; k2 < 4; k2++ )
+                {
+                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
+                        mk1 = k1, mk2 = k2;
+                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
+                        Mk1 = k1, Mk2 = k2;
+                }
+            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
+        }
+
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            int sum=0;
+            for ( i =0; i<16; i++ )
+            {
+                sum += v[i] * itab[i] ;
+            }
+            // Add half of the weight scale, then shift: round-to-nearest division.
+            dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+        }
+    }
+}
+
+/**********************************************8UC4*********************************************
+***********************************************************************************************/
+
+// Nearest-neighbour perspective warp, four-channel 8-bit (uchar4 pixels).
+// Each work-item computes one destination pixel (dx, dy): the source position
+// (X0/W, Y0/W) from M[0..8] is rounded to the nearest pixel and the source
+// uchar4 is copied. Positions outside the source image produce a zero vector.
+// Offsets and steps are shifted right by 2 to index in uchar4 units.
+// NOTE(review): that suggests both are passed in bytes - confirm with the
+// host-side launcher.
+__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
+                                      int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
+                                      int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? (F)1/W : (F)0;
+        // Saturating conversion kept in int. Narrowing to short wraps positions
+        // beyond +/-32767 back into range, and an unsaturated float-to-int
+        // conversion is not well defined when the quotient is huge (W near 0).
+        int sx = convert_int_sat(rint(X0*W));
+        int sy = convert_int_sat(rint(Y0*W));
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0;
+    }
+}
+
+// Bilinear perspective warp, four-channel 8-bit (uchar4 pixels).
+// Each work-item computes one destination pixel (dx, dy). The source position
+// (X0/W, Y0/W) from M[0..8] is scaled by INTER_TAB_SIZE and rounded, giving an
+// integer pixel plus an INTER_BITS-bit sub-pixel offset. The four neighbouring
+// pixels are widened to int4 and blended with integer weights scaled by
+// INTER_REMAP_COEF_SCALE; the result is rounded and saturated to uchar4.
+// Neighbours outside the source image contribute a zero vector.
+// Offsets and steps are shifted right by 2 to index in uchar4 units.
+__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
+        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>2);
+        srcStep = (srcStep>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        // Saturate: near W == 0 the quotient can exceed the int range, and an
+        // unsaturated float-to-int conversion is not well defined there.
+        int X = convert_int_sat(rint(X0*W));
+        int Y = convert_int_sat(rint(Y0*W));
+
+        // Integer pixel position kept as int: narrowing to short wraps positions
+        // beyond +/-32767 back into range and lets them pass the bounds tests.
+        int sx = X >> INTER_BITS;
+        int sy = Y >> INTER_BITS;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+
+        int4 v0, v1, v2, v3;
+
+        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0;
+        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0;
+        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0;
+        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0;
+
+        int itab0, itab1, itab2, itab3;
+        float taby, tabx;
+        taby = 1.f/INTER_TAB_SIZE*ay;
+        tabx = 1.f/INTER_TAB_SIZE*ax;
+
+        // Integer weights, saturated to the short range before widening to int.
+        itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
+        itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE ));
+        itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE ));
+        itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE ));
+
+        int4 val;
+        val = v0 * itab0 +  v1 * itab1 + v2 * itab2 + v3 * itab3;
+
+        // Add half of the weight scale, then shift: round-to-nearest division.
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] =  convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+    }
+}
+
+// Bicubic perspective warp, four-channel 8-bit (uchar4 pixels).
+// Each work-item computes one destination pixel (dx, dy) from a 4x4 source
+// neighbourhood whose top-left corner is one pixel up and left of the integer
+// source position. Per-axis weights come from interpolateCubic(), are combined
+// into 16 integer weights scaled by INTER_REMAP_COEF_SCALE, and are corrected
+// so that they sum exactly to that scale. The weighted sum is rounded and
+// saturated to uchar4. Neighbours outside the source image contribute a zero
+// vector. Offsets and steps are shifted right by 2 to index in uchar4 units.
+__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
+        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>2);
+        srcStep = (srcStep>>2);
+        dst_offset = (dst_offset>>2);
+        dstStep = (dstStep>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        // Saturate: near W == 0 the quotient can exceed the int range, and an
+        // unsaturated float-to-int conversion is not well defined there.
+        int X = convert_int_sat(rint(X0*W));
+        int Y = convert_int_sat(rint(Y0*W));
+
+        // Top-left corner of the 4x4 window, kept as int: narrowing to short
+        // wraps positions beyond +/-32767 back into range and lets them pass
+        // the bounds tests below.
+        int sx = (X >> INTER_BITS) - 1;
+        int sy = (Y >> INTER_BITS) - 1;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+        uchar4 v[16];
+        int i,j;
+        // Row-major gather of the window (i = row, j = column).
+#pragma unroll 4
+        for(i=0; i<4; i++)
+            for(j=0; j<4; j++)
+            {
+                v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)])  : (uchar4)0;
+            }
+        int itab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = INTER_SCALE * ay;
+        axx = INTER_SCALE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+        int isum = 0;
+
+#pragma unroll 16
+        for( i=0; i<16; i++ )
+        {
+            float tmp;
+            tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE;
+            itab[i] = rint(tmp);
+            isum += itab[i];
+        }
+
+        // Rounding can make the integer weights sum to slightly more or less
+        // than the scale. Push the difference into the largest (if the sum is
+        // too small) or smallest (if too large) of the four weights at
+        // rows/columns 2..3 so the total is exact.
+        if( isum != INTER_REMAP_COEF_SCALE )
+        {
+            int k1, k2;
+            int diff = isum - INTER_REMAP_COEF_SCALE;
+            int Mk1=2, Mk2=2, mk1=2, mk2=2;
+
+            for( k1 = 2; k1 < 4; k1++ )
+                for( k2 = 2; k2 < 4; k2++ )
+                {
+
+                    if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] )
+                        mk1 = k1, mk2 = k2;
+                    else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] )
+                        Mk1 = k1, Mk2 = k2;
+                }
+
+            diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff));
+        }
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            int4 sum=0;
+            for ( i =0; i<16; i++ )
+            {
+                sum += convert_int4(v[i]) * itab[i];
+            }
+            // Add half of the weight scale, then shift: round-to-nearest division.
+            dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ;
+        }
+    }
+}
+
+
+/**********************************************32FC1********************************************
+***********************************************************************************************/
+
+// Nearest-neighbour perspective warp, single-channel 32-bit float.
+// Each work-item computes one destination pixel (dx, dy): the source position
+// (X0/W, Y0/W) from M[0..8] is rounded to the nearest pixel and the source
+// value is copied. Positions outside the source image produce 0.
+// Offsets are shifted right by 2 before indexing; steps are used unchanged.
+// NOTE(review): that suggests byte offsets and strides in float elements -
+// confirm with the host-side launcher.
+__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
+                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                      int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? (F)1/W : (F)0;
+        // Saturating conversion kept in int. Narrowing to short wraps positions
+        // beyond +/-32767 back into range, and an unsaturated float-to-int
+        // conversion is not well defined when the quotient is huge (W near 0).
+        int sx = convert_int_sat(rint(X0*W));
+        int sy = convert_int_sat(rint(Y0*W));
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0;
+    }
+}
+
+// Bilinear perspective warp, single-channel 32-bit float.
+// Each work-item computes one destination pixel (dx, dy). The source position
+// (X0/W, Y0/W) from M[0..8] is scaled by INTER_TAB_SIZE and rounded, giving an
+// integer pixel plus an INTER_BITS-bit sub-pixel offset; the four neighbouring
+// pixels are blended with float weights derived from that offset. Neighbours
+// outside the source image contribute 0.
+// Offsets are shifted right by 2 before indexing; steps are used unchanged.
+__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
+        int dst_cols, int dst_rows, int srcStep, int dstStep,
+        int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        // Saturate: near W == 0 the quotient can exceed the int range, and an
+        // unsaturated float-to-int conversion is not well defined there.
+        int X = convert_int_sat(rint(X0*W));
+        int Y = convert_int_sat(rint(Y0*W));
+
+        // Integer pixel position kept as int: narrowing to short wraps positions
+        // beyond +/-32767 back into range and lets them pass the bounds tests.
+        int sx = X >> INTER_BITS;
+        int sy = Y >> INTER_BITS;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+        float v0, v1, v2, v3;
+
+        v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0;
+        v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0;
+        v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0;
+        v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0;
+
+        float tab[4];
+        float taby[2], tabx[2];
+        // Single-precision literals; k/INTER_TAB_SIZE is exact in float, so the
+        // weights are the same as with a double-precision 1.0.
+        taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay;
+        taby[1] = 1.f/INTER_TAB_SIZE*ay;
+        tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax;
+        tabx[1] = 1.f/INTER_TAB_SIZE*ax;
+
+        tab[0] = taby[0] * tabx[0];
+        tab[1] = taby[0] * tabx[1];
+        tab[2] = taby[1] * tabx[0];
+        tab[3] = taby[1] * tabx[1];
+
+        float sum = 0;
+        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>2)+dy*dstStep+dx] = sum;
+    }
+}
+
+// Bicubic perspective warp, single-channel 32-bit float.
+// Each work-item computes one destination pixel (dx, dy) as the weighted sum of
+// a 4x4 source neighbourhood whose top-left corner is one pixel up and left of
+// the integer source position. Per-axis weights come from interpolateCubic();
+// neighbours outside the source image contribute 0.
+// Offsets are shifted right by 2 before indexing; steps are used unchanged.
+__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows,
+        int dst_cols, int dst_rows, int srcStep, int dstStep,
+        int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>2);
+        dst_offset = (dst_offset>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in type F so the float build needs no double
+        // arithmetic; W == 0 is mapped to a scale of 0 to avoid dividing by zero.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        // Saturate: near W == 0 the quotient can exceed the int range, and an
+        // unsaturated float-to-int conversion is not well defined there.
+        int X = convert_int_sat(rint(X0*W));
+        int Y = convert_int_sat(rint(Y0*W));
+
+        // Top-left corner of the 4x4 window, kept as int: narrowing to short
+        // wraps positions beyond +/-32767 back into range and lets them pass
+        // the bounds tests below.
+        int sx = (X >> INTER_BITS) - 1;
+        int sy = (Y >> INTER_BITS) - 1;
+        // Sub-pixel position in [0, INTER_TAB_SIZE).
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+        float v[16];
+        int i;
+
+        // Row-major gather of the window: column = i & 3, row = i >> 2.
+        for(i=0; i<16;  i++)
+            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0;
+
+        float tab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = 1.f/INTER_TAB_SIZE * ay;
+        axx = 1.f/INTER_TAB_SIZE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+
+        // Separable 2-D weights: product of the row and column weights.
+#pragma unroll 4
+        for( i=0; i<16; i++ )
+        {
+            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
+        }
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            float sum = 0;
+#pragma unroll 4
+            for ( i =0; i<16; i++ )
+            {
+                sum += v[i] * tab[i];
+            }
+            dst[dst_offset+dy*dstStep+dx] = sum;
+
+        }
+    }
+}
+
+
+/**********************************************32FC4********************************************
+***********************************************************************************************/
+
+// Perspective warp of a 4-channel float image with nearest-neighbour sampling.
+//
+// Each work-item handles destination pixel (dx, dy): it is projected through M
+// (M[0..2] -> x, M[3..5] -> y, M[6..8] -> w), the result is rounded to the nearest source
+// pixel, and that pixel is copied. Source positions outside the image produce a zero pixel.
+//
+// Offsets are shifted right by 4 and steps by 2 before indexing the float4 buffers
+// (presumably byte offsets and float-element steps - confirm with the host code).
+// threadCols limits the work-items that do any work, dst_cols / dst_rows limit what is written.
+__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
+                                      int dst_cols, int dst_rows, int srcStep, int dstStep,
+                                      int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in F so that no double arithmetic is introduced when F is float.
+        W = (W != (F)0) ? (F)1/W : (F)0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+        // Saturate rather than truncate: a plain (short) cast lets coordinates far outside the
+        // image wrap around into the valid range and pass the bounds test below.
+        short sx = convert_short_sat(X);
+        short sy = convert_short_sat(Y);
+
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float4)0;
+    }
+}
+
+// Perspective warp of a 4-channel float image with bilinear interpolation.
+//
+// Each work-item handles destination pixel (dx, dy). The pixel is projected through M
+// (M[0..2] -> x, M[3..5] -> y, M[6..8] -> w) into a fixed-point source position with
+// INTER_BITS fractional bits; the four surrounding source pixels are blended using the
+// fractional parts as weights. Source pixels outside the image contribute zero.
+//
+// Offsets are shifted right by 4 and steps by 2 before indexing the float4 buffers
+// (presumably byte offsets and float-element steps - confirm with the host code).
+// threadCols limits the work-items that do any work, dst_cols / dst_rows limit what is written.
+__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows,
+        int dst_cols, int dst_rows, int srcStep, int dstStep,
+        int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows)
+    {
+        src_offset = (src_offset>>4);
+        dst_offset = (dst_offset>>4);
+        srcStep = (srcStep>>2);
+        dstStep = (dstStep>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in F so that no double arithmetic is introduced when F is float.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+
+        // Saturate rather than truncate: a plain (short) cast lets coordinates far outside the
+        // image wrap around into the valid range and pass the bounds tests below.
+        short sx0 = convert_short_sat(X >> INTER_BITS);
+        short sy0 = convert_short_sat(Y >> INTER_BITS);
+        // Fractional parts select the interpolation weights.
+        short ay0 = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax0 = (short)(X & (INTER_TAB_SIZE-1));
+
+
+        float4 v0, v1, v2, v3;
+
+        // v0 v1 / v2 v3 are the top-left, top-right, bottom-left and bottom-right neighbours.
+        v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0;
+        v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0;
+        v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0;
+        v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0;
+
+        float tab[4];
+        float taby[2], tabx[2];
+        // Single-precision literals keep the weight computation in float.
+        taby[0] = 1.f - 1.f/INTER_TAB_SIZE*ay0;
+        taby[1] = 1.f/INTER_TAB_SIZE*ay0;
+        tabx[0] = 1.f - 1.f/INTER_TAB_SIZE*ax0;
+        tabx[1] = 1.f/INTER_TAB_SIZE*ax0;
+
+        tab[0] = taby[0] * tabx[0];
+        tab[1] = taby[0] * tabx[1];
+        tab[2] = taby[1] * tabx[0];
+        tab[3] = taby[1] * tabx[1];
+
+        float4 sum = 0;
+        sum += v0 * tab[0] +  v1 * tab[1] +  v2 * tab[2] +  v3 * tab[3];
+        if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+            dst[dst_offset+dy*dstStep+dx] = sum;
+    }
+}
+
+// Perspective warp of a 4-channel float image using a 4x4 (bicubic) neighbourhood.
+//
+// Each work-item handles destination pixel (dx, dy). The pixel is projected through M
+// (M[0..2] -> x, M[3..5] -> y, M[6..8] -> w) into a fixed-point source position with
+// INTER_BITS fractional bits. The 16 surrounding source pixels are blended with separable
+// weights obtained from interpolateCubic() (defined elsewhere in this file); pixels that
+// fall outside the source image contribute zero.
+//
+// Offsets are shifted right by 4 and steps by 2 before indexing the float4 buffers
+// (presumably byte offsets and float-element steps - confirm with the host code).
+// threadCols limits the work-items that do any work, dst_cols / dst_rows limit what is written.
+__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst,
+        int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep,
+        int dstStep, int src_offset, int dst_offset,  __constant F * M, int threadCols )
+{
+    int dx = get_global_id(0);
+    int dy = get_global_id(1);
+
+    if( dx < threadCols && dy < dst_rows )
+    {
+        src_offset = (src_offset>>4);
+        dst_offset = (dst_offset>>4);
+        srcStep = (srcStep>>2);
+        dstStep = (dstStep>>2);
+
+        F X0 = M[0]*dx + M[1]*dy + M[2];
+        F Y0 = M[3]*dx + M[4]*dy + M[5];
+        F W = M[6]*dx + M[7]*dy + M[8];
+        // Constants are written in F so that no double arithmetic is introduced when F is float.
+        W = (W != (F)0) ? INTER_TAB_SIZE/W : (F)0;
+        int X = rint(X0*W);
+        int Y = rint(Y0*W);
+
+        // Saturate rather than truncate: a plain (short) cast lets coordinates far outside the
+        // image wrap around into the valid range and pass the bounds test below.
+        short sx = convert_short_sat((X >> INTER_BITS) - 1);
+        short sy = convert_short_sat((Y >> INTER_BITS) - 1);
+        // Fractional parts select the interpolation weights.
+        short ay = (short)(Y & (INTER_TAB_SIZE-1));
+        short ax = (short)(X & (INTER_TAB_SIZE-1));
+
+
+        float4 v[16];
+        int i;
+
+        // Gather the 4x4 neighbourhood row by row: i&3 is the column, i>>2 the row.
+        for(i=0; i<16;  i++)
+            v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0;
+
+        float tab[16];
+        float tab1y[4], tab1x[4];
+        float axx, ayy;
+
+        ayy = 1.f/INTER_TAB_SIZE * ay;
+        axx = 1.f/INTER_TAB_SIZE * ax;
+        interpolateCubic(ayy, tab1y);
+        interpolateCubic(axx, tab1x);
+
+        // Outer product of the two 1-D weight vectors gives the 4x4 weight table.
+#pragma unroll 4
+        for( i=0; i<16; i++ )
+        {
+            tab[i] = tab1y[(i>>2)] * tab1x[(i&3)];
+        }
+
+        if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows)
+        {
+            float4 sum = 0;
+#pragma unroll 4
+            for ( i =0; i<16; i++ )
+            {
+                sum += v[i] * tab[i];
+            }
+            dst[dst_offset+dy*dstStep+dx] = sum;
+
+        }
+    }
+}
--- a/modules/imgproc/src/precomp.hpp
+++ b/modules/imgproc/src/precomp.hpp
@ -48,6 +48,8 @@

 #include "opencv2/imgproc/imgproc_c.h"
 #include "opencv2/core/private.hpp"
+#include "opencv2/core/ocl.hpp"
+#include "opencl_kernels.hpp"

 #include <math.h>
 #include <assert.h>
--- a/modules/imgproc/test/test_imgproc_umat.cpp
+++ b/modules/imgproc/test/test_imgproc_umat.cpp
@ -0,0 +1,81 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "test_precomp.hpp"
+#include <string>
+
+using namespace cv;
+using namespace std;
+
+// Smoke test for the UMat overloads of cvtColor, resize and equalizeHist: the same pipeline
+// is run once on a Mat and once on a UMat obtained from the same image.
+//
+// The test is non-interactive: it opens no windows and never waits for a key press, so it
+// can run unattended. It fails when the input image cannot be loaded or when the UMat result
+// differs from the Mat result in size or type.
+class CV_ImgprocUMatTest : public cvtest::BaseTest
+{
+public:
+    CV_ImgprocUMatTest() {}
+    ~CV_ImgprocUMatTest() {}
+protected:
+    void run(int)
+    {
+        string imgpath = string(ts->get_data_path()) + "shared/lena.png";
+        Mat img = imread(imgpath, 1), gray, smallimg, result;
+        if( img.empty() )
+        {
+            // Without this check a missing data file only shows up as an obscure failure
+            // inside the first processing call.
+            ts->printf(cvtest::TS::LOG, "Unable to load test image %s\n", imgpath.c_str());
+            ts->set_failed_test_info(cvtest::TS::FAIL_MISSING_TEST_DATA);
+            return;
+        }
+        UMat uimg = img.getUMat(ACCESS_READ), ugray, usmallimg, uresult;
+
+        // Reference path on Mat.
+        cvtColor(img, gray, COLOR_BGR2GRAY);
+        resize(gray, smallimg, Size(), 0.75, 0.75, INTER_LINEAR);
+        equalizeHist(smallimg, result);
+
+        // Same pipeline on UMat.
+        cvtColor(uimg, ugray, COLOR_BGR2GRAY);
+        resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR);
+        equalizeHist(usmallimg, uresult);
+
+        // NOTE(review): pixel values are not compared because the two paths may not be
+        // bit-exact; add a tolerance-based comparison once the expected accuracy is known.
+        Mat uresultOnHost = uresult.getMat(ACCESS_READ);
+        if( uresultOnHost.size() != result.size() || uresultOnHost.type() != result.type() )
+        {
+            ts->printf(cvtest::TS::LOG, "UMat result does not match the Mat result in size or type\n");
+            ts->set_failed_test_info(cvtest::TS::FAIL_INVALID_OUTPUT);
+            return;
+        }
+
+        ts->set_failed_test_info(cvtest::TS::OK);
+    }
+};
+
+TEST(Imgproc_UMat, regression) { CV_ImgprocUMatTest test; test.safe_run(); }
--- a/modules/nonfree/src/precomp.hpp
+++ b/modules/nonfree/src/precomp.hpp
@ -52,6 +52,8 @@
 #include "opencv2/nonfree/cuda.hpp"
 #include "opencv2/core/private.cuda.hpp"

+#include "opencv2/core/ocl.hpp"
+
 #include "opencv2/opencv_modules.hpp"

 #ifdef HAVE_OPENCV_CUDAARITHM
--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@ -51,6 +51,8 @@
 using namespace cv;
 using namespace cv::ocl;

+// File-local copy of the kernel-source entry cv::ocl::nonfree::surf, so that it can be
+// referred to as plain `surf` in this file.
+static ProgramEntry surf = cv::ocl::nonfree::surf;
+
 namespace cv
 {
    namespace ocl
--- a/modules/objdetect/include/opencv2/objdetect.hpp
+++ b/modules/objdetect/include/opencv2/objdetect.hpp
@ -159,14 +159,14 @@ public:
    CV_WRAP virtual bool empty() const;
    CV_WRAP bool load( const String& filename );
    virtual bool read( const FileNode& node );
-    CV_WRAP virtual void detectMultiScale( const Mat& image,
+    CV_WRAP virtual void detectMultiScale( InputArray image,
                                   CV_OUT std::vector<Rect>& objects,
                                   double scaleFactor = 1.1,
                                   int minNeighbors = 3, int flags = 0,
                                   Size minSize = Size(),
                                   Size maxSize = Size() );

-    CV_WRAP virtual void detectMultiScale( const Mat& image,
+    CV_WRAP virtual void detectMultiScale( InputArray image,
                                   CV_OUT std::vector<Rect>& objects,
                                   CV_OUT std::vector<int>& numDetections,
                                   double scaleFactor=1.1,
@ -174,7 +174,7 @@ public:
                                   Size minSize=Size(),
                                   Size maxSize=Size() );

-    CV_WRAP virtual void detectMultiScale( const Mat& image,
+    CV_WRAP virtual void detectMultiScale( InputArray image,
                                   CV_OUT std::vector<Rect>& objects,
                                   CV_OUT std::vector<int>& rejectLevels,
                                   CV_OUT std::vector<double>& levelWeights,
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@ -1154,13 +1154,14 @@ void CascadeClassifier::detectMultiScaleNoGrouping( const Mat& image, std::vecto
    }
 }

-void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
+void CascadeClassifier::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
                                          std::vector<int>& rejectLevels,
                                          std::vector<double>& levelWeights,
                                          double scaleFactor, int minNeighbors,
                                          int flags, Size minObjectSize, Size maxObjectSize,
                                          bool outputRejectLevels )
 {
+    Mat image = _image.getMat();
    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );

    if( empty() )
@ -1188,21 +1189,23 @@ void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& o
    }
 }

-void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
+void CascadeClassifier::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
                                          double scaleFactor, int minNeighbors,
                                          int flags, Size minObjectSize, Size maxObjectSize)
 {
+    Mat image = _image.getMat();
    std::vector<int> fakeLevels;
    std::vector<double> fakeWeights;
    detectMultiScale( image, objects, fakeLevels, fakeWeights, scaleFactor,
        minNeighbors, flags, minObjectSize, maxObjectSize );
 }

-void CascadeClassifier::detectMultiScale( const Mat& image, std::vector<Rect>& objects,
+void CascadeClassifier::detectMultiScale( InputArray _image, std::vector<Rect>& objects,
                                          std::vector<int>& numDetections, double scaleFactor,
                                          int minNeighbors, int flags, Size minObjectSize,
                                          Size maxObjectSize )
 {
+    Mat image = _image.getMat();
    CV_Assert( scaleFactor > 1 && image.depth() == CV_8U );

    if( empty() )
--- a/modules/objdetect/src/opencl/haarobjectdetect.cl
+++ b/modules/objdetect/src/opencl/haarobjectdetect.cl
@ -0,0 +1,423 @@
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Niko Li, newlife20080214@gmail.com
+//    Wang Weiyan, wangweiyanster@gmail.com
+//    Jia Haipeng, jiahaipeng95@gmail.com
+//    Nathan, liujun@multicorewareinc.com
+//    Peng Xiao, pengxiao@outlook.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//
+
+// Number of rectangle slots per Haar feature; sizes the p[][] and weight[] arrays of
+// GpuHidHaarTreeNode.
+#define CV_HAAR_FEATURE_MAX           3
+
+// Four-corner box sums: sum[p0] - sum[p1] - sum[p2] + sum[p3], each index shifted by `offset`.
+// Both macros expect an array named `sum` in the expanding scope and a `rect` exposing
+// p0..p3 (scalars for calc_sum, arrays indexed by `i` for calc_sum1).
+#define calc_sum(rect,offset)        (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset])
+#define calc_sum1(rect,offset,i)     (sum[(rect).p0[i]+offset] - sum[(rect).p1[i]+offset] - sum[(rect).p2[i]+offset] + sum[(rect).p3[i]+offset])
+
+// Presumably the element types of the sum and squared-sum tables - confirm with the host code.
+typedef int   sumtype;
+typedef float sqsumtype;
+
+// Selects the node-evaluation variant compiled into the kernel: 1 evaluates every node as a
+// stump, 0 enables the root/child tree path. Defaults to 1 unless the build options define it.
+#ifndef STUMP_BASED
+#define STUMP_BASED 1
+#endif
+
+// One classifier node as stored in the node buffer read by gpuRunHaarClassifierCascade.
+// The kernel reads members through vector loads, so member order and alignment must not change.
+typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode
+{
+    // One row of four ints per rectangle. The kernel loads each row as an int4 and uses
+    // .x/.z as column terms and .y/.w as row terms when indexing its cached tile.
+    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64)));
+    // Per-rectangle weights; loaded together with `threshold` below as a single float4.
+    float weight[CV_HAAR_FEATURE_MAX];
+    // Node threshold; the kernel multiplies it by the window's variance normalisation factor.
+    float threshold;
+    // Node outputs: stump mode picks alpha[0] or alpha[1]; tree mode also uses alpha[2].
+    float alpha[3] __attribute__((aligned (16)));
+    // Child links; the tree path of the kernel only tests them for being non-zero.
+    int left __attribute__((aligned (4)));
+    int right __attribute__((aligned (4)));
+}
+GpuHidHaarTreeNode;
+
+
+// Classifier record: a node count plus pointers to nodes and alpha values.
+// Not referenced by the kernel in this file.
+// NOTE(review): presumably mirrors a host-side structure - confirm before changing the layout.
+typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier
+{
+    int count __attribute__((aligned (4)));
+    GpuHidHaarTreeNode* node __attribute__((aligned (8)));
+    float* alpha __attribute__((aligned (8)));
+}
+GpuHidHaarClassifier;
+
+
+// One cascade stage as stored in the stage buffer read by gpuRunHaarClassifierCascade.
+// The kernel loads the first two members of an entry as one int2, so `count` and
+// `threshold` must stay first and adjacent.
+typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier
+{
+    // Number of classifiers evaluated for this stage.
+    int  count __attribute__((aligned (4)));
+    // A window passes the stage when its accumulated sum is >= this value.
+    float threshold __attribute__((aligned (4)));
+    // The remaining members are not read by the kernel in this file.
+    int two_rects __attribute__((aligned (4)));
+    int reserved0 __attribute__((aligned (8)));
+    int reserved1 __attribute__((aligned (8)));
+    int reserved2 __attribute__((aligned (8)));
+    int reserved3 __attribute__((aligned (8)));
+}
+GpuHidHaarStageClassifier;
+
+
+// Cascade-level record. Not referenced by the kernel in this file, which receives the
+// corresponding values as separate kernel arguments instead.
+// NOTE(review): presumably mirrors a host-side structure - confirm before changing the layout.
+typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
+{
+    int  count __attribute__((aligned (4)));
+    int  is_stump_based __attribute__((aligned (4)));
+    int  has_tilted_features __attribute__((aligned (4)));
+    int  is_tree __attribute__((aligned (4)));
+    int pq0 __attribute__((aligned (4)));
+    int pq1 __attribute__((aligned (4)));
+    int pq2 __attribute__((aligned (4)));
+    int pq3 __attribute__((aligned (4)));
+    int p0 __attribute__((aligned (4)));
+    int p1 __attribute__((aligned (4)));
+    int p2 __attribute__((aligned (4)));
+    int p3 __attribute__((aligned (4)));
+    float inv_window_area __attribute__((aligned (4)));
+} GpuHidHaarClassifierCascade;
+
+// Evaluates a Haar cascade with a 20x20 detection window over every scale listed in `info`.
+//
+// Work-groups are fixed at 8x8 work-items. For each scale a group walks the 8x8-pixel tiles
+// assigned to it, copies the part of the sum table covering the tile's windows into local
+// memory and evaluates the cascade in two phases:
+//   1. stages [start_stage, split_stage): every work-item evaluates its own window;
+//   2. stages [split_stage, end_stage): windows that survived are queued in local memory and
+//      the work-items of the group share the nodes of each stage between them.
+// Windows that pass all stages are appended to this group's slice of `candidate`.
+//
+// stagecascadeptr  stage table; only the leading (count, threshold) pair of an entry is read
+// info             one int4 per scale: x = width<<16 | height, y = groups per row<<16 | total
+//                  groups, z = offset of the scale inside sum1/sqsum1, w = scale factor bits
+// nodeptr          node table; phase 1 starts at startnode, phase 2 at splitnode
+// sum1, sqsum1     sum and squared-sum tables for all scales
+// candidate        output, 256 int4 slots per work-group: (x, y, size, size) per detection
+// pixelstep        row stride of sum1/sqsum1 in elements
+// loopcount        number of entries in `info`
+// p, pq            corner offsets used for the window mean (local tile) and the window
+//                  square sum (global table)
+// correction       scale applied to the window sums - presumably 1/window area; confirm
+__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
+    global GpuHidHaarStageClassifier * stagecascadeptr,
+    global int4 * info,
+    global GpuHidHaarTreeNode * nodeptr,
+    global const int * restrict sum1,
+    global const float * restrict sqsum1,
+    global int4 * candidate,
+    const int pixelstep,
+    const int loopcount,
+    const int start_stage,
+    const int split_stage,
+    const int end_stage,
+    const int startnode,
+    const int splitnode,
+    const int4 p,
+    const int4 pq,
+    const float correction)
+{
+    int grpszx = get_local_size(0);
+    int grpszy = get_local_size(1);
+    int grpnumx = get_num_groups(0);
+    int grpidx = get_group_id(0);
+    int lclidx = get_local_id(0);
+    int lclidy = get_local_id(1);
+
+    int lcl_sz = mul24(grpszx,grpszy);
+    int lcl_id = mad24(lclidy,grpszx,lclidx);
+
+    // Local memory layout (in ints): 28*28 tile | output counter | queue counter |
+    // queue of (coord, variance) pairs, two ints per work-item | one partial sum per work-item.
+    __local int lclshare[1024];
+    __local int* lcldata = lclshare;//for save win data
+    __local int* glboutindex = lcldata + 28*28;//for save global out index
+    __local int* lclcount = glboutindex + 1;//for save the numuber of temp pass pixel
+    __local int* lcloutindex = lclcount + 1;//for save info of temp pass pixel
+    __local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1));
+    glboutindex[0]=0;
+    // Each work-group owns 256 consecutive slots of `candidate`.
+    int outputoff = mul24(grpidx,256);
+
+    //assume window size is 20X20
+    // Parenthesised so the macro stays correct inside any expression.
+#define WINDOWSIZE (20+1)
+    //make sure readwidth is the multiple of 4
+    //ystep =1, from host code
+    int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2;
+    int readheight = grpszy-1+WINDOWSIZE;
+    int read_horiz_cnt = readwidth >> 2;//each read int4
+    int total_read = mul24(read_horiz_cnt,readheight);
+    // The shift by 6 relies on the fixed work-group size of 64.
+    int read_loop = (total_read + lcl_sz - 1) >> 6;
+    // Clear this group's output slice: 64 work-items x 4 slots = 256 slots.
+    candidate[outputoff+(lcl_id<<2)] = (int4)0;
+    candidate[outputoff+(lcl_id<<2)+1] = (int4)0;
+    candidate[outputoff+(lcl_id<<2)+2] = (int4)0;
+    candidate[outputoff+(lcl_id<<2)+3] = (int4)0;
+    for(int scalei = 0; scalei <loopcount; scalei++)
+    {
+        int4 scaleinfo1= info[scalei];
+        int width = (scaleinfo1.x & 0xffff0000) >> 16;
+        int height = scaleinfo1.x & 0xffff;
+        int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16;
+        int totalgrp = scaleinfo1.y & 0xffff;
+        int imgoff = scaleinfo1.z;
+        float factor = as_float(scaleinfo1.w);
+
+        __global const int * sum = sum1 + imgoff;
+        __global const float * sqsum = sqsum1 + imgoff;
+        for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx)
+        {
+            // Tile coordinates; this grpidx deliberately shadows the work-group id above.
+            int grpidy = grploop / grpnumperline;
+            int grpidx = grploop - mul24(grpidy, grpnumperline);
+            int x = mad24(grpidx,grpszx,lclidx);
+            int y = mad24(grpidy,grpszy,lclidy);
+            int grpoffx = x-lclidx;
+            int grpoffy = y-lclidy;
+
+            // Cooperative copy of the tile's sum-table patch into local memory, one int4 per
+            // work-item per pass; surplus work-items re-read element 0.
+            for(int i=0; i<read_loop; i++)
+            {
+                int pos_id = mad24(i,lcl_sz,lcl_id);
+                pos_id = pos_id < total_read ? pos_id : 0;
+
+                int lcl_y = pos_id / read_horiz_cnt;
+                int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt);
+
+                int glb_x = grpoffx + (lcl_x<<2);
+                int glb_y = grpoffy + lcl_y;
+
+                // Rows below the image are clamped to the last row.
+                int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x);
+                int4 data = *(__global int4*)&sum[glb_off];
+                int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2);
+
+                vstore4(data, 0, &lcldata[lcl_off]);
+            }
+
+            lcloutindex[lcl_id] = 0;
+            lclcount[0] = 0;
+            int result = 1;
+            int nodecounter= startnode;
+            float mean, variance_norm_factor;
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            int lcl_off = mad24(lclidy,readwidth,lclidx);
+            int4 cascadeinfo1, cascadeinfo2;
+            cascadeinfo1 = p;
+            cascadeinfo2 = pq;
+
+            // Window mean from the cached tile.
+            cascadeinfo1.x +=lcl_off;
+            cascadeinfo1.z +=lcl_off;
+            mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] -
+                    lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)])
+                    *correction;
+
+            int p_offset = mad24(y, pixelstep, x);
+
+            // Window square sum straight from global memory.
+            cascadeinfo2.x +=p_offset;
+            cascadeinfo2.z +=p_offset;
+            variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] -
+                                    sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)];
+
+            variance_norm_factor = variance_norm_factor * correction - mean * mean;
+            variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f;
+
+            // Phase 1: each work-item runs the leading stages on its own window.
+            for(int stageloop = start_stage; (stageloop < split_stage)  && result; stageloop++ )
+            {
+                float stage_sum = 0.f;
+                int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
+                float stagethreshold = as_float(stageinfo.y);
+                for(int nodeloop = 0; nodeloop < stageinfo.x; )
+                {
+                    __global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter);
+
+                    int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
+                    int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
+                    int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
+                    // w.xyz are the rectangle weights, w.w is the node threshold.
+                    float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
+                    float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
+
+                    float nodethreshold  = w.w * variance_norm_factor;
+
+                    info1.x +=lcl_off;
+                    info1.z +=lcl_off;
+                    info2.x +=lcl_off;
+                    info2.z +=lcl_off;
+
+                    float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
+                                        lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
+
+                    classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
+                                    lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
+
+                    info3.x +=lcl_off;
+                    info3.z +=lcl_off;
+                    classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
+                                    lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
+
+                    bool passThres = classsum >= nodethreshold;
+#if STUMP_BASED
+                    stage_sum += passThres ? alpha3.y : alpha3.x;
+                    nodecounter++;
+                    nodeloop++;
+#else
+                    // Tree mode: nodes come in (root, child) pairs, roots at even indices.
+                    bool isRootNode = (nodecounter & 1) == 0;
+                    if(isRootNode)
+                    {
+                        if( (passThres && currentnodeptr->right) ||
+                            (!passThres && currentnodeptr->left))
+                        {
+                            nodecounter ++;
+                        }
+                        else
+                        {
+                            stage_sum += alpha3.x;
+                            nodecounter += 2;
+                            nodeloop ++;
+                        }
+                    }
+                    else
+                    {
+                        stage_sum += passThres ? alpha3.z : alpha3.y;
+                        nodecounter ++;
+                        nodeloop ++;
+                    }
+#endif
+                }
+
+                result = (stage_sum >= stagethreshold);
+            }
+
+            // Queue the windows that survived phase 1: packed local coordinate + variance bits.
+            if(result && (x < width) && (y < height))
+            {
+                int queueindex = atomic_inc(lclcount);
+                lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx;
+                lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+            int queuecount  = lclcount[0];
+            barrier(CLK_LOCAL_MEM_FENCE);
+            nodecounter = splitnode;
+            // Phase 2: the whole group works on the queued windows, several work-items per window.
+            for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++)
+            {
+                lclcount[0]=0;
+                barrier(CLK_LOCAL_MEM_FENCE);
+
+                int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
+                float stagethreshold = as_float(stageinfo.y);
+
+                // 8 windows x 8 work-items when more than 4 windows are queued,
+                // otherwise 4 windows x 16 work-items.
+                int perfscale = queuecount > 4 ? 3 : 2;
+                int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale;
+                int lcl_compute_win = lcl_sz >> perfscale;
+                int lcl_compute_win_id = (lcl_id >>(6-perfscale));
+                int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale);
+                int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale));
+                for(int queueloop=0; queueloop<queuecount_loop; queueloop++)
+                {
+                    float stage_sum = 0.f;
+                    int temp_coord = lcloutindex[lcl_compute_win_id<<1];
+                    float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]);
+                    int queue_pixel = mad24(((temp_coord  & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff);
+
+                    if(lcl_compute_win_id < queuecount)
+                    {
+                        int tempnodecounter = lcl_compute_id;
+                        float part_sum = 0.f;
+                        const int stump_factor = STUMP_BASED ? 1 : 2;
+                        int root_offset = 0;
+                        for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;)
+                        {
+                            __global GpuHidHaarTreeNode* currentnodeptr =
+                                nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset;
+
+                            int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0]));
+                            int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0]));
+                            int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0]));
+                            float4 w = *(__global float4*)(&(currentnodeptr->weight[0]));
+                            float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0]));
+                            float nodethreshold  = w.w * variance_norm_factor;
+
+                            info1.x +=queue_pixel;
+                            info1.z +=queue_pixel;
+                            info2.x +=queue_pixel;
+                            info2.z +=queue_pixel;
+
+                            float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] -
+                                                lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x;
+
+
+                            classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] -
+                                            lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y;
+
+                            info3.x +=queue_pixel;
+                            info3.z +=queue_pixel;
+                            classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] -
+                                            lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z;
+
+                            bool passThres = classsum >= nodethreshold;
+#if STUMP_BASED
+                            part_sum += passThres ? alpha3.y : alpha3.x;
+                            tempnodecounter += lcl_compute_win;
+                            lcl_loop++;
+#else
+                            if(root_offset == 0)
+                            {
+                                if( (passThres && currentnodeptr->right) ||
+                                    (!passThres && currentnodeptr->left))
+                                {
+                                    root_offset = 1;
+                                }
+                                else
+                                {
+                                    part_sum += alpha3.x;
+                                    tempnodecounter += lcl_compute_win;
+                                    lcl_loop++;
+                                }
+                            }
+                            else
+                            {
+                                part_sum += passThres ? alpha3.z : alpha3.y;
+                                tempnodecounter += lcl_compute_win;
+                                lcl_loop++;
+                                root_offset = 0;
+                            }
+#endif
+                        }//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++)
+                        partialsum[lcl_id]=part_sum;
+                    }
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                    if(lcl_compute_win_id < queuecount)
+                    {
+                        // The first work-item of each window adds up the partial sums and
+                        // re-queues the window if it passed the stage.
+                        for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++)
+                        {
+                            stage_sum += partialsum[lcl_id+i];
+                        }
+                        if(stage_sum >= stagethreshold && (lcl_compute_id==0))
+                        {
+                            int queueindex = atomic_inc(lclcount);
+                            lcloutindex[queueindex<<1] = temp_coord;
+                            lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor);
+                        }
+                        lcl_compute_win_id +=(1<<perfscale);
+                    }
+                    barrier(CLK_LOCAL_MEM_FENCE);
+                }//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++)
+
+                queuecount = lclcount[0];
+                barrier(CLK_LOCAL_MEM_FENCE);
+                nodecounter += stageinfo.x;
+            }//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++)
+
+            // Emit the windows that passed every stage.
+            if(lcl_id<queuecount)
+            {
+                int temp = lcloutindex[lcl_id<<1];
+                int x = mad24(grpidx,grpszx,temp & 0xffff);
+                int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16));
+                int4 candidate_result;
+                candidate_result.zw = (int2)convert_int_rtn(factor*20.f);
+                candidate_result.x = convert_int_rtn(x*factor);
+                candidate_result.y = convert_int_rtn(y*factor);
+                // Reserve the slot with the value returned by atomic_inc. Reading the counter
+                // first and incrementing it afterwards is a race: a work-item can observe the
+                // counter after a neighbour already bumped it and then write to a slot that
+                // overlaps another detection.
+                int slot = atomic_inc(glboutindex);
+                // Never write past the 256 slots owned by this work-group.
+                if(slot < 256)
+                    candidate[outputoff+slot] = candidate_result;
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
+    }//end for(int scalei = 0; scalei <loopcount; scalei++)
+}
--- a/modules/objdetect/src/opencl/haarobjectdetect_scaled2.cl
+++ b/modules/objdetect/src/opencl/haarobjectdetect_scaled2.cl
@ -0,0 +1,306 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Wu Xinglong, wxl370@126.com
+//    Sen Liu, swjtuls1987@126.com
+//    Peng Xiao, pengxiao@outlook.com
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+// Enter your kernel in this window
+//#pragma OPENCL EXTENSION cl_amd_printf:enable
+#define CV_HAAR_FEATURE_MAX           3 // number of rectangle slots stored in each tree node
+typedef int   sumtype; // not referenced by the kernels in this file
+typedef float sqsumtype; // not referenced by the kernels in this file
+
+typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode // one Haar feature node; layout presumably mirrors a host-side struct -- TODO confirm before changing any field or alignment
+{
+    int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64))); // rectangle i: gpuscaleclassifier reads p[i] as {x, y, w, h} and writes {x, y, x + w, y + h}
+    float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/; // one weight per rectangle
+    float threshold /*__attribute__((aligned (4)))*/; // must directly follow weight[]: the cascade kernel loads weight[0..2] and this field with a single float4 read
+    float alpha[3] __attribute__((aligned(16))); // values added to the stage sum depending on the node outcome
+    int left __attribute__((aligned(4))); // tested only for non-zero in the non-stump path of the cascade kernel
+    int right __attribute__((aligned(4))); // tested only for non-zero in the non-stump path of the cascade kernel
+}
+GpuHidHaarTreeNode;
+typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier // not referenced by either kernel in this file
+{
+    int count __attribute__((aligned(4)));
+    GpuHidHaarTreeNode *node __attribute__((aligned(8)));
+    float *alpha __attribute__((aligned(8)));
+}
+GpuHidHaarClassifier;
+typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier // per-stage data read by gpuRunHaarClassifierCascade_scaled2
+{
+    int  count __attribute__((aligned(4))); // number of classifiers evaluated in the stage
+    float threshold __attribute__((aligned(4))); // the stage passes when its accumulated sum is >= this value
+    int two_rects __attribute__((aligned(4))); // not read by the kernels in this file
+    int reserved0 __attribute__((aligned(8)));
+    int reserved1 __attribute__((aligned(8)));
+    int reserved2 __attribute__((aligned(8)));
+    int reserved3 __attribute__((aligned(8)));
+}
+GpuHidHaarStageClassifier;
+typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade // not referenced by either kernel in this file
+{
+    int  count __attribute__((aligned(4)));
+    int  is_stump_based __attribute__((aligned(4)));
+    int  has_tilted_features __attribute__((aligned(4)));
+    int  is_tree __attribute__((aligned(4)));
+    int pq0 __attribute__((aligned(4)));
+    int pq1 __attribute__((aligned(4)));
+    int pq2 __attribute__((aligned(4)));
+    int pq3 __attribute__((aligned(4)));
+    int p0 __attribute__((aligned(4)));
+    int p1 __attribute__((aligned(4)));
+    int p2 __attribute__((aligned(4)));
+    int p3 __attribute__((aligned(4)));
+    float inv_window_area __attribute__((aligned(4)));
+} GpuHidHaarClassifierCascade;
+
+__kernel void gpuRunHaarClassifierCascade_scaled2( // runs stages [start_stage, end_stage) on every window of every scale in info[]; windows that pass all of them are written to candidate[]
+    global GpuHidHaarStageClassifier *stagecascadeptr, // per-stage classifier count and pass threshold
+    global int4 *info, // one entry per scale: x = width (high 16 bits) | height (low 16 bits), y = grpnumperline (high) | totalgrp (low), w = bit pattern of the float scale factor
+    global GpuHidHaarTreeNode *nodeptr, // tree nodes; the nodes for scale i start at startnode + nodecount * i
+    global const int *restrict sum, // presumably the integral image -- TODO confirm; indexed as row * step + column
+    global const float   *restrict sqsum, // presumably the integral image of squares -- TODO confirm; same indexing as sum
+    global int4 *candidate, // output; each work-group uses the 256 slots starting at group_id * 256
+    const int rows, // together with cols, only used to clamp sum/sqsum indices to [0, rows * cols - 1]
+    const int cols,
+    const int step, // row stride of sum/sqsum, in elements
+    const int loopcount, // number of scales, i.e. entries read from info, p and correction
+    const int start_stage,
+    const int split_stage, // not used by this kernel
+    const int end_stage,
+    const int startnode,
+    global int4 *p, // per scale: x/z are column offsets and y/w row offsets of the window corners used for mean and variance
+    global float *correction, // per-scale factor applied to the window sums (looks like 1 / window area -- TODO confirm)
+    const int nodecount) // number of nodes stored per scale
+{
+    int grpszx = get_local_size(0);
+    int grpszy = get_local_size(1);
+    int grpnumx = get_num_groups(0);
+    int grpidx = get_group_id(0);
+    int lclidx = get_local_id(0);
+    int lclidy = get_local_id(1);
+    int lcl_sz = mul24(grpszx, grpszy);
+    int lcl_id = mad24(lclidy, grpszx, lclidx); // flattened index of this work-item inside its work-group
+    __local int glboutindex[1]; // despite the name this is __local: a per-work-group count of candidates emitted so far
+    __local int lclcount[1]; // number of passing windows in the current tile
+    __local int lcloutindex[64]; // packed coordinates of passing windows; indexed by lcl_id, so the work-group must not exceed 64 work-items
+    glboutindex[0] = 0;
+    int outputoff = mul24(grpidx, 256); // first candidate slot owned by this work-group
+    candidate[outputoff + (lcl_id << 2)] = (int4)0; // each work-item clears 4 consecutive slots
+    candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
+    candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
+    candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
+    int max_idx = rows * cols - 1; // upper clamp for every sum/sqsum index
+    for (int scalei = 0; scalei < loopcount; scalei++)
+    {
+        int4 scaleinfo1;
+        scaleinfo1 = info[scalei];
+        int width = (scaleinfo1.x & 0xffff0000) >> 16;
+        int height = scaleinfo1.x & 0xffff;
+        int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
+        int totalgrp = scaleinfo1.y & 0xffff;
+        float factor = as_float(scaleinfo1.w); // reinterpret the stored bits as the float scale factor
+        float correction_t = correction[scalei];
+        int ystep = (int)(max(2.0f, factor) + 0.5f); // spacing between neighbouring windows, used for both x and y
+
+        for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx) // each work-group handles tiles group_id, group_id + grpnumx, ...
+        {
+            int4 cascadeinfo = p[scalei];
+            int grpidy = grploop / grpnumperline;
+            int grpidx = grploop - mul24(grpidy, grpnumperline); // shadows the outer grpidx; outputoff above was computed from the outer one
+            int ix = mad24(grpidx, grpszx, lclidx); // window index of this work-item within the scale
+            int iy = mad24(grpidy, grpszy, lclidy);
+            int x = ix * ystep; // window origin, also added to every rectangle offset below
+            int y = iy * ystep;
+            lcloutindex[lcl_id] = 0;
+            lclcount[0] = 0;
+            int nodecounter;
+            float mean, variance_norm_factor;
+            //if((ix < width) && (iy < height))
+            {
+                const int p_offset = mad24(y, step, x); // linear offset of the window origin
+                cascadeinfo.x += p_offset;
+                cascadeinfo.z += p_offset;
+                mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] // four-corner combination of sum over the window, scaled by correction_t
+                - sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                        sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+                + sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
+                       * correction_t;
+                variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)] // same four-corner combination on sqsum
+                - sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
+                                       sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+                + sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
+                variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
+                variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f; // falls back to 1 when the value is negative
+                bool result = true;
+                nodecounter = startnode + nodecount * scalei; // first node of the current scale
+                for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++) // stops at the first failed stage
+                {
+                    float stage_sum = 0.f;
+                    int   stagecount = stagecascadeptr[stageloop].count;
+                    for (int nodeloop = 0; nodeloop < stagecount;) // nodeloop is advanced inside the body
+                    {
+                        __global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
+                        int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
+                        int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
+                        int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
+                        float4 w = *(__global float4 *)(&(currentnodeptr->weight[0])); // w.xyz = weight[0..2]; w.w is the float stored right after weight[], i.e. the node threshold
+                        float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
+                        float nodethreshold  = w.w * variance_norm_factor;
+
+                        info1.x += p_offset;
+                        info1.z += p_offset;
+                        info2.x += p_offset;
+                        info2.z += p_offset;
+                        info3.x += p_offset;
+                        info3.z += p_offset;
+                        float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)] // weighted four-corner sum of rectangle 0
+                        - sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
+                                          sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+                        + sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
+                        classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)] // rectangle 1
+                        - sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+                        + sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
+                        classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)] // rectangle 2
+                        - sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
+                                     sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+                        + sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;
+
+                        bool passThres = classsum >= nodethreshold;
+
+#if STUMP_BASED
+                        stage_sum += passThres ? alpha3.y : alpha3.x; // stump: exactly one node per classifier
+                        nodecounter++;
+                        nodeloop++;
+#else
+                        bool isRootNode = (nodecounter & 1) == 0; // an even node index is treated as a root node
+                        if(isRootNode)
+                        {
+                            if( (passThres && currentnodeptr->right) ||
+                                (!passThres && currentnodeptr->left))
+                            {
+                                nodecounter ++; // continue with the node stored right after the root; nodeloop is not advanced
+                            }
+                            else
+                            {
+                                stage_sum += alpha3.x;
+                                nodecounter += 2; // skip the node stored after the root
+                                nodeloop ++;
+                            }
+                        }
+                        else
+                        {
+                            stage_sum += (passThres ? alpha3.z : alpha3.y);
+                            nodecounter ++;
+                            nodeloop ++;
+                        }
+#endif
+                    }
+                    result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
+                }
+
+                barrier(CLK_LOCAL_MEM_FENCE); // local-memory barrier placed before the atomic updates of lclcount below
+
+                if (result && (ix < width) && (iy < height)) // only windows inside the scaled image are reported
+                {
+                    int queueindex = atomic_inc(lclcount);
+                    lcloutindex[queueindex] = (y << 16) | x; // pack y into the high and x into the low 16 bits
+                }
+                barrier(CLK_LOCAL_MEM_FENCE); // all queue entries must be written before they are read back
+                int queuecount = lclcount[0];
+
+                if (lcl_id < queuecount) // the first queuecount work-items each emit one queued window
+                {
+                    int temp = lcloutindex[lcl_id];
+                    int x = temp & 0xffff;
+                    int y = (temp & (int)0xffff0000) >> 16;
+                    temp = atomic_inc(glboutindex); // temp now holds the counter value before the increment
+                    int4 candidate_result;
+                    candidate_result.zw = (int2)convert_int_rtn(factor * 20.f); // z and w both get floor(factor * 20)
+                    candidate_result.x = x;
+                    candidate_result.y = y;
+                    candidate[outputoff + temp + lcl_id] = candidate_result; // slot is counter + lcl_id, so used slots need not be contiguous
+                }
+
+                barrier(CLK_LOCAL_MEM_FENCE); // keeps the next tile from resetting lclcount/lcloutindex while they are still being read
+            }
+        }
+    }
+}
+__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
+{
+    // Each work-item rescales one node: orinode[gid] is read, newnode[gid + nodenum] is written.
+    const int src_index = get_global_id(0);
+    GpuHidHaarTreeNode node = orinode[src_index];
+    global GpuHidHaarTreeNode *dst = newnode + (src_index + nodenum);
+    int rect_x[3], rect_y[3], rect_w[3], rect_h[3];
+
+    // Scale every rectangle's {x, y, w, h}: multiply by scale, add 0.5f, truncate to int.
+#pragma unroll
+    for (int r = 0; r < 3; r++)
+    {
+        rect_x[r] = (int)(node.p[r][0] * scale + 0.5f);
+        rect_y[r] = (int)(node.p[r][1] * scale + 0.5f);
+        rect_w[r] = (int)(node.p[r][2] * scale + 0.5f);
+        rect_h[r] = (int)(node.p[r][3] * scale + 0.5f);
+    }
+
+    // Recompute weight[0] so that weight[0] * area0 cancels the weighted areas
+    // of the other rectangles; rectangle 2 only contributes when p[2][0] is non-zero.
+    if (node.p[2][0])
+    {
+        node.weight[0] = -(node.weight[1] * rect_h[1] * rect_w[1] + node.weight[2] * rect_h[2] * rect_w[2]) / (rect_h[0] * rect_w[0]);
+    }
+    else
+    {
+        node.weight[0] = -node.weight[1] * rect_h[1] * rect_w[1] / (rect_h[0] * rect_w[0]);
+    }
+
+    // Store rectangles as {x0, y0, x1, y1} corners and apply weight_scale to the weights.
+#pragma unroll
+    for (int r = 0; r < 3; r++)
+    {
+        dst->p[r][0] = rect_x[r];
+        dst->p[r][1] = rect_y[r];
+        dst->p[r][2] = rect_x[r] + rect_w[r];
+        dst->p[r][3] = rect_y[r] + rect_h[r];
+        dst->weight[r] = node.weight[r] * weight_scale;
+    }
+
+    // The remaining fields are copied through unchanged.
+    dst->alpha[0] = node.alpha[0];
+    dst->alpha[1] = node.alpha[1];
+    dst->alpha[2] = node.alpha[2];
+    dst->threshold = node.threshold;
+    dst->left = node.left;
+    dst->right = node.right;
+}
--- a/modules/objdetect/src/precomp.hpp
+++ b/modules/objdetect/src/precomp.hpp
@ -49,6 +49,7 @@
 #include "opencv2/ml.hpp"

 #include "opencv2/core/utility.hpp"
+#include "opencv2/core/ocl.hpp"

 #include "opencv2/opencv_modules.hpp"
 #ifdef HAVE_OPENCV_HIGHGUI
--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@ -47,6 +47,7 @@
 #define __OPENCV_OCL_PRIVATE_UTIL__

 #include "opencv2/ocl/cl_runtime/cl_runtime.hpp"
+#include "opencv2/core/ocl_genbase.hpp"

 #include "opencv2/ocl.hpp"

@ -55,13 +56,6 @@ namespace cv
 namespace ocl
 {

-struct ProgramEntry
-{
-    const char* name;
-    const char* programStr;
-    const char* programHash;
-};
-
 inline cl_device_id getClDeviceID(const Context *ctx)
 {
    return *(cl_device_id*)(ctx->getOpenCLDeviceIDPtr());
--- a/modules/superres/src/btv_l1_ocl.cpp
+++ b/modules/superres/src/btv_l1_ocl.cpp
@ -64,6 +64,8 @@ using namespace cv::ocl;
 using namespace cv::superres;
 using namespace cv::superres::detail;

+static ProgramEntry superres_btvl1 = cv::ocl::superres::superres_btvl1;
+
 namespace cv
 {
    namespace ocl
--- a/modules/superres/src/precomp.hpp
+++ b/modules/superres/src/precomp.hpp
@ -56,6 +56,7 @@
 #include "opencv2/core/private.hpp"

 #include "opencv2/core/private.cuda.hpp"
+#include "opencv2/core/ocl.hpp"

 #ifdef HAVE_OPENCV_CUDAARITHM
 #  include "opencv2/cudaarithm.hpp"
--- a/samples/cpp/ufacedetect.cpp
+++ b/samples/cpp/ufacedetect.cpp
@ -0,0 +1,276 @@
+#include "opencv2/objdetect.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core/utility.hpp"
+#include "opencv2/core/ocl.hpp"
+
+#include <cctype>
+#include <iostream>
+#include <iterator>
+#include <stdio.h>
+
+using namespace std;
+using namespace cv;
+
+// Prints the usage banner and the OpenCV version to stdout.
+// The --scale text describes what main() actually accepts: the value is used
+// directly as the resize factor and anything above 1 is reset to 1.
+static void help()
+{
+    cout << "\nThis program demonstrates the cascade recognizer. Now you can use Haar or LBP features.\n"
+            "This classifier can recognize many kinds of rigid objects, once the appropriate classifier is trained.\n"
+            "Its most known use is for faces.\n"
+            "Usage:\n"
+            "./facedetect [--cascade=<cascade_path> this is the primary trained classifier such as frontal face]\n"
+               "   [--nested-cascade[=nested_cascade_path this an optional secondary classifier such as eyes]]\n"
+               "   [--scale=<image resize factor, at most 1 (larger values are reset to 1), try 0.5 for example>]\n"
+               "   [--try-flip]\n"
+               "   [filename|camera_index]\n\n"
+            "see facedetect.cmd for one call:\n"
+            "./facedetect --cascade=\"../../data/haarcascades/haarcascade_frontalface_alt.xml\" --nested-cascade=\"../../data/haarcascades/haarcascade_eye.xml\" --scale=0.5\n\n"
+            "During execution:\n\tHit any key to quit.\n"
+            "\tUsing OpenCV version " << CV_VERSION << "\n" << endl;
+}
+
+void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
+                    CascadeClassifier& nestedCascade,
+                    double scale, bool tryflip ); // forward declaration; the definition follows main()
+
+string cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml"; // default primary cascade; replaced by --cascade=<path>
+string nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml"; // default nested cascade; replaced by --nested-cascade=<path>
+
+// Returns true when the C string `arg` begins with `prefix`.
+// Going through std::string keeps the comparison inside the argument's own
+// characters, even when the argument is shorter than the prefix.
+static bool startsWith( const char* arg, const string& prefix )
+{
+    return string(arg).compare( 0, prefix.length(), prefix ) == 0;
+}
+
+// Entry point of the sample.
+// Parses --cascade=, --nested-cascade[=], --scale= and --try-flip, treats any
+// other argument not starting with '-' as the input name, then runs
+// detectAndDraw() on camera/video frames, on a single image, or on every image
+// listed (one per line) in a text file.
+// Returns -1 when the primary cascade cannot be loaded, 0 otherwise.
+int main( int argc, const char** argv )
+{
+    VideoCapture capture;
+    UMat frame, image;
+    Mat canvas;
+    const string scaleOpt = "--scale=";
+    const string cascadeOpt = "--cascade=";
+    const string nestedCascadeOpt = "--nested-cascade";
+    const string tryFlipOpt = "--try-flip";
+    String inputName;
+    bool tryflip = false;
+
+    help();
+
+    CascadeClassifier cascade, nestedCascade;
+    double scale = 1;
+
+    for( int i = 1; i < argc; i++ )
+    {
+        cout << "Processing " << i << " " <<  argv[i] << endl;
+        if( startsWith( argv[i], cascadeOpt ) )
+        {
+            cascadeName.assign( argv[i] + cascadeOpt.length() );
+            cout << "  from which we have cascadeName= " << cascadeName << endl;
+        }
+        else if( startsWith( argv[i], nestedCascadeOpt ) )
+        {
+            // The prefix matched, so this index is at most the terminating NUL.
+            if( argv[i][nestedCascadeOpt.length()] == '=' )
+                nestedCascadeName.assign( argv[i] + nestedCascadeOpt.length() + 1 );
+            if( !nestedCascade.load( nestedCascadeName ) )
+                cerr << "WARNING: Could not load classifier cascade for nested objects" << endl;
+        }
+        else if( startsWith( argv[i], scaleOpt ) )
+        {
+            // sscanf returns EOF (non-zero) on empty input, so require exactly
+            // one conversion. The value is used as the resize factor: it must
+            // be positive and is never allowed to enlarge the image.
+            if( sscanf( argv[i] + scaleOpt.length(), "%lf", &scale ) != 1 || scale <= 0 || scale > 1 )
+                scale = 1;
+            cout << " from which we read scale = " << scale << endl;
+        }
+        else if( startsWith( argv[i], tryFlipOpt ) )
+        {
+            tryflip = true;
+            cout << " will try to flip image horizontally to detect assymetric objects\n";
+        }
+        else if( argv[i][0] == '-' )
+        {
+            cerr << "WARNING: Unknown option " << argv[i] << endl;
+        }
+        else
+            inputName = argv[i];
+    }
+
+    if( !cascade.load( cascadeName ) )
+    {
+        cerr << "ERROR: Could not load classifier cascade" << endl;
+        help();
+        return -1;
+    }
+
+    // No input, or a single digit: open that camera index (0 by default).
+    // The <cctype> functions require a value representable as unsigned char.
+    if( inputName.empty() || (isdigit((unsigned char)inputName.c_str()[0]) && inputName.c_str()[1] == '\0') )
+    {
+        int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0';
+        if(!capture.open(c))
+            cout << "Capture from camera #" <<  c << " didn't work" << endl;
+    }
+    else
+    {
+        // inputName is known to be non-empty here: try it as an image first,
+        // then as a video file.
+        image = imread( inputName, 1 ).getUMat(ACCESS_READ);
+        if( image.empty() )
+        {
+            if(!capture.open( inputName ))
+                cout << "Could not read " << inputName << endl;
+        }
+    }
+
+    namedWindow( "result", 1 );
+
+    if( capture.isOpened() )
+    {
+        cout << "Video capturing has been started ..." << endl;
+        for(;;)
+        {
+            capture >> frame;
+            if( frame.empty() )
+                break;
+
+            detectAndDraw( frame, canvas, cascade, nestedCascade, scale, tryflip );
+
+            if( waitKey( 10 ) >= 0 )
+                break;
+        }
+    }
+    else
+    {
+        cout << "Detecting face(s) in " << inputName << endl;
+        if( !image.empty() )
+        {
+            detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip );
+            waitKey(0);
+        }
+        else if( !inputName.empty() )
+        {
+            /* assume it is a text file containing the
+            list of the image filenames to be processed - one per line */
+            FILE* f = fopen( inputName.c_str(), "rt" );
+            if( f )
+            {
+                char buf[1000+1];
+                while( fgets( buf, 1000, f ) )
+                {
+                    int len = (int)strlen(buf), c;
+                    // Strip the trailing newline and any other trailing whitespace.
+                    while( len > 0 && isspace((unsigned char)buf[len-1]) )
+                        len--;
+                    buf[len] = '\0';
+                    cout << "file " << buf << endl;
+                    image = imread( buf, 1 ).getUMat(ACCESS_READ);
+                    if( !image.empty() )
+                    {
+                        detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip );
+                        c = waitKey(0);
+                        if( c == 27 || c == 'q' || c == 'Q' )
+                            break;
+                    }
+                    else
+                    {
+                        cerr << "Aw snap, couldn't read image " << buf << endl;
+                    }
+                }
+                fclose(f);
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Detects objects in `img` with `cascade`, draws them on `canvas` and shows
+// the canvas in the "result" window.
+//   img           - BGR input frame (converted to grayscale internally).
+//   canvas        - output; receives the resized, equalized grayscale frame
+//                   converted back to BGR, with the detections and an fps
+//                   label drawn on it.
+//   cascade       - primary classifier.
+//   nestedCascade - optional classifier run inside every primary hit; skipped
+//                   when empty.
+//   scale0        - resize factor applied to the frame before detection.
+//   tryflip       - also run the primary classifier on a horizontally mirrored
+//                   copy and merge the hits, mapped back to unmirrored coordinates.
+void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
+                    CascadeClassifier& nestedCascade,
+                    double scale0, bool tryflip )
+{
+    int i = 0;
+    double t = 0;
+    // Everything is drawn on the already-resized canvas, so detection
+    // coordinates map onto it 1:1.
+    const double scale = 1;
+    vector<Rect> faces, faces2;
+    const static Scalar colors[] =
+    {
+        Scalar(0,0,255),
+        Scalar(0,128,255),
+        Scalar(0,255,255),
+        Scalar(0,255,0),
+        Scalar(255,128,0),
+        Scalar(255,255,0),
+        Scalar(255,0,0),
+        Scalar(255,0,255)
+    };
+    static UMat gray, smallImg, flipped;
+
+    t = (double)getTickCount();
+
+    cvtColor( img, gray, COLOR_BGR2GRAY );
+    resize( gray, smallImg, Size(), scale0, scale0, INTER_LINEAR );
+    equalizeHist( smallImg, smallImg );
+
+    cascade.detectMultiScale( smallImg, faces,
+        1.1, 2, 0
+        //|CASCADE_FIND_BIGGEST_OBJECT
+        //|CASCADE_DO_ROUGH_SEARCH
+        |CASCADE_SCALE_IMAGE
+        ,
+        Size(30, 30) );
+    if( tryflip )
+    {
+        // Mirror into a separate buffer. Flipping smallImg in place would leave
+        // the canvas and the nested-cascade ROIs mirrored relative to the
+        // rectangles in `faces`, which are all in unmirrored coordinates.
+        flip(smallImg, flipped, 1);
+        cascade.detectMultiScale( flipped, faces2,
+                                 1.1, 2, 0
+                                 //|CASCADE_FIND_BIGGEST_OBJECT
+                                 //|CASCADE_DO_ROUGH_SEARCH
+                                 |CASCADE_SCALE_IMAGE
+                                 ,
+                                 Size(30, 30) );
+        for( vector<Rect>::const_iterator r = faces2.begin(); r != faces2.end(); r++ )
+        {
+            // Map the hit from mirrored back to unmirrored x coordinates.
+            faces.push_back(Rect(flipped.cols - r->x - r->width, r->y, r->width, r->height));
+        }
+    }
+    t = (double)getTickCount() - t;
+    // Single conversion of the processed frame into the drawing canvas.
+    cvtColor(smallImg, canvas, COLOR_GRAY2BGR);
+
+    double fps = getTickFrequency()/t;
+
+    putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50),
+            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
+
+    for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
+    {
+        vector<Rect> nestedObjects;
+        Point center;
+        Scalar color = colors[i%8];
+        int radius;
+
+        // Roughly square hits are drawn as circles, the rest as rectangles.
+        double aspect_ratio = (double)r->width/r->height;
+        if( 0.75 < aspect_ratio && aspect_ratio < 1.3 )
+        {
+            center.x = cvRound((r->x + r->width*0.5)*scale);
+            center.y = cvRound((r->y + r->height*0.5)*scale);
+            radius = cvRound((r->width + r->height)*0.25*scale);
+            circle( canvas, center, radius, color, 3, 8, 0 );
+        }
+        else
+            rectangle( canvas, Point(cvRound(r->x*scale), cvRound(r->y*scale)),
+                       Point(cvRound((r->x + r->width-1)*scale), cvRound((r->y + r->height-1)*scale)),
+                       color, 3, 8, 0);
+        if( nestedCascade.empty() )
+            continue;
+        UMat smallImgROI = smallImg(*r);
+        nestedCascade.detectMultiScale( smallImgROI, nestedObjects,
+            1.1, 2, 0
+            //|CASCADE_FIND_BIGGEST_OBJECT
+            //|CASCADE_DO_ROUGH_SEARCH
+            //|CASCADE_DO_CANNY_PRUNING
+            |CASCADE_SCALE_IMAGE
+            ,
+            Size(30, 30) );
+        for( vector<Rect>::const_iterator nr = nestedObjects.begin(); nr != nestedObjects.end(); nr++ )
+        {
+            // Nested hits are relative to the ROI, so offset them by the parent rectangle.
+            center.x = cvRound((r->x + nr->x + nr->width*0.5)*scale);
+            center.y = cvRound((r->y + nr->y + nr->height*0.5)*scale);
+            radius = cvRound((nr->width + nr->height)*0.25*scale);
+            circle( canvas, center, radius, color, 3, 8, 0 );
+        }
+    }
+    imshow( "result", canvas );
+}
--- a/samples/ocl/facedetect.cpp
+++ b/samples/ocl/facedetect.cpp
@ -11,7 +11,7 @@

 using namespace std;
 using namespace cv;
-#define LOOP_NUM 10
+#define LOOP_NUM 1

 const static Scalar colors[] =  { CV_RGB(0,0,255),
                                  CV_RGB(0,128,255),
@ -83,7 +83,7 @@ int main( int argc, const char** argv )
    }

    CvCapture* capture = 0;
-    Mat frame, frameCopy, image;
+    Mat frame, frameCopy0, frameCopy, image;

    bool useCPU = cmd.get<bool>("s");
    string inputName = cmd.get<string>("i");
@ -129,16 +129,21 @@ int main( int argc, const char** argv )
            if( frame.empty() )
                break;
            if( iplImg->origin == IPL_ORIGIN_TL )
-                frame.copyTo( frameCopy );
+                frame.copyTo( frameCopy0 );
            else
-                flip( frame, frameCopy, 0 );
+                flip( frame, frameCopy0, 0 );
+            if( scale == 1)
+                frameCopy0.copyTo(frameCopy);
+            else
+                resize(frameCopy0, frameCopy, Size(), 1./scale, 1./scale, INTER_LINEAR);

+            work_end = 0;
            if(useCPU)
-                detectCPU(frameCopy, faces, cpu_cascade, scale, false);
+                detectCPU(frameCopy, faces, cpu_cascade, 1, false);
            else
-                detect(frameCopy, faces, cascade, scale, false);
+                detect(frameCopy, faces, cascade, 1, false);

-            Draw(frameCopy, faces, scale);
+            Draw(frameCopy, faces, 1);
            if( waitKey( 10 ) >= 0 )
                break;
        }
@ -150,6 +155,7 @@ int main( int argc, const char** argv )
        vector<Rect> faces;
        vector<Rect> ref_rst;
        double accuracy = 0.;
+        work_end = 0;
        for(int i = 0; i <= LOOP_NUM; i ++)
        {
            cout << "loop" << i << endl;
@ -188,7 +194,7 @@ void detect( Mat& img, vector<Rect>& faces,
 {
    ocl::oclMat image(img);
    ocl::oclMat gray, smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
-    if(calTime) workBegin();
+    workBegin();
    ocl::cvtColor( image, gray, COLOR_BGR2GRAY );
    ocl::resize( gray, smallImg, smallImg.size(), 0, 0, INTER_LINEAR );
    ocl::equalizeHist( smallImg, smallImg );
@ -197,14 +203,14 @@ void detect( Mat& img, vector<Rect>& faces,
                              3, 0
                              |CASCADE_SCALE_IMAGE
                              , Size(30,30), Size(0, 0) );
-    if(calTime) workEnd();
+    workEnd();
 }

 void detectCPU( Mat& img, vector<Rect>& faces,
                CascadeClassifier& cascade,
                double scale, bool calTime)
 {
-    if(calTime) workBegin();
+    workBegin();
    Mat cpu_gray, cpu_smallImg( cvRound (img.rows/scale), cvRound(img.cols/scale), CV_8UC1 );
    cvtColor(img, cpu_gray, COLOR_BGR2GRAY);
    resize(cpu_gray, cpu_smallImg, cpu_smallImg.size(), 0, 0, INTER_LINEAR);
@ -212,13 +218,15 @@ void detectCPU( Mat& img, vector<Rect>& faces,
    cascade.detectMultiScale(cpu_smallImg, faces, 1.1,
                             3, 0 | CASCADE_SCALE_IMAGE,
                             Size(30, 30), Size(0, 0));
-    if(calTime) workEnd();
+    workEnd();
 }


 void Draw(Mat& img, vector<Rect>& faces, double scale)
 {
    int i = 0;
+    putText(img, format("fps: %.1f", 1000./getTime()), Point(450, 50),
+            FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);
    for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
    {
        Point center;
@ -229,7 +237,7 @@ void Draw(Mat& img, vector<Rect>& faces, double scale)
        radius = cvRound((r->width + r->height)*0.25*scale);
        circle( img, center, radius, color, 3, 8, 0 );
    }
-    imwrite( outputName, img );
+    //imwrite( outputName, img );
    if(abs(scale-1.0)>.001)
    {
        resize(img, img, Size((int)(img.cols/scale), (int)(img.rows/scale)));